In [1]:
# import core libraries 
import datetime
import json
import re
import csv
import ast
import pathlib
import itertools
from collections import Counter
from itertools import islice

# import third-party libraries
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pandas import ExcelWriter


In [2]:
# set directory path data
twitter_data_dir = pathlib.Path('/Users/adamstueckrath/Desktop/twitter_data/')

# tweets_no_rts_csv file path
tweets_no_rts_csv = twitter_data_dir / 'tweets_no_retweets' / 'tweets_no_retweets.csv'


In [3]:
def string_to_datetime(tweet_date):
    """
    Turns a datetime string like this: 
    '2017-07-06T18:34:37.000Z' 
    to a Python datetime object like this -> 2017-07-06 18:34:41
    """
    return datetime.datetime.strptime(tweet_date, "%Y-%m-%dT%H:%M:%S.%fZ")



In [4]:
# load tweets into dataframe from csv file
tweets_no_rts_df = pd.read_csv(tweets_no_rts_csv, header=0,
                               parse_dates=['tweet_created_at'], 
                               date_parser=string_to_datetime)


In [5]:
tweets_no_rts_df_en = tweets_no_rts_df.copy()
tweets_no_rts_df_en = tweets_no_rts_df_en[tweets_no_rts_df_en['tweet_lang'] =='en']

In [6]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
analyzer = SentimentIntensityAnalyzer()
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    tweet = tweet.lower()
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analyze_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using nltk. analysis variable returns the following dict: 
    {'neg': 0.122, 'neu': 0.641, 'pos': 0.237, 'compound': 0.4215}
    The compound value here conveys the overall positive or negative user experience.
    Examples: 
    https://www.programcreek.com/python/example/100005/nltk.sentiment.vader.SentimentIntensityAnalyzer
    https://opensourceforu.com/2016/12/analysing-sentiments-nltk/
    '''
    analysis = analyzer.polarity_scores(clean_tweet(tweet))
    if analysis['compound'] > 0.1:
        return 1
    elif analysis['compound'] == 0:
        return 0
    else:
        return -1
    



In [7]:
tweets_no_rts_df_en['tweet_text_clean'] = tweets_no_rts_df_en['tweet_text'].apply(clean_tweet)
tweets_no_rts_df_en['tweet_text_sentiment'] = tweets_no_rts_df_en['tweet_text_clean'].apply(analyze_sentiment)


In [8]:
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,2))

In [53]:
test = tweets_no_rts_df_en.copy()[:1000000]
test.shape

(638161, 31)

In [54]:
vectorized_data = count_vectorizer.fit_transform(test.tweet_text)
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))

In [55]:
def sentiment2target(sentiment):
    return {
        -1: 0,
        0: 1,
        1: 2
    }[sentiment]
targets = test.tweet_text_sentiment.apply(sentiment2target)

In [None]:
from sklearn.model_selection import train_test_split
data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, targets, test_size=0.4, random_state=0)
data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]


In [56]:
indexed_data.shape

(638161, 2986500)

In [57]:
len(targets)

638161

In [58]:
import time
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# iris = datasets.load_iris()
# X, y = iris.data, iris.target
X, y = indexed_data, targets


# start = time.time()
# clf = OneVsRestClassifier(SVC(kernel='linear', probability=True, class_weight='balanced'))
# clf.fit(X, y)
# end = time.time()
# print ("Single SVC", end - start, clf.score(X,y))
# proba = clf.predict_proba(X)

# n_estimators = 10
# start = time.time()
# clf = OneVsRestClassifier(BaggingClassifier(SVC(kernel='linear', probability=True, class_weight='balanced'), max_samples=1.0 / n_estimators, n_estimators=n_estimators))
# clf.fit(X, y)
# end = time.time()
# print ("Bagging SVC", end - start, clf.score(X,y))
# proba = clf.predict_proba(X)

start = time.time()
clf = RandomForestClassifier(min_samples_leaf=20)
clf.fit(X, y)
end = time.time()
print ("Random Forest", end - start, clf.score(X,y))
proba = clf.predict_proba(X)




Random Forest 37.56977891921997 0.6976609350931818


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
analyzer = SentimentIntensityAnalyzer()
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    tweet = tweet.lower()
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analyze_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using nltk. analysis variable returns the following dict: 
    {'neg': 0.122, 'neu': 0.641, 'pos': 0.237, 'compound': 0.4215}
    The compound value here conveys the overall positive or negative user experience.
    Examples: 
    https://www.programcreek.com/python/example/100005/nltk.sentiment.vader.SentimentIntensityAnalyzer
    https://opensourceforu.com/2016/12/analysing-sentiments-nltk/
    '''
    analysis = analyzer.polarity_scores(clean_tweet(tweet))
    if analysis['compound'] > 0.1:
        return 1
    elif analysis['compound'] == 0:
        return 0
    else:
        return -1
    

In [None]:
test = 'Assad loses 3 generals during the first week of July #Syria #RevolutionWins #militia #Free_Syrian_Army\nhttps://t.co/5x8UbVOnH9'

testing = clean_tweet(test)
print(testing)


In [None]:
tweets_no_rts_df_en = tweets_no_rts_df.copy()
tweets_no_rts_df_en = tweets_no_rts_df_en[tweets_no_rts_df_en['tweet_lang'] =='en']


In [None]:
tweets_no_rts_df_en['tweet_text_clean'] = tweets_no_rts_df_en['tweet_text'].apply(clean_tweet)
tweets_no_rts_df_en['tweet_text_sentiment'] = tweets_no_rts_df_en['tweet_text_clean'].apply(analyze_sentiment)


In [None]:
t = tweets_no_rts_df_en['tweet_text_sentiment'].value_counts().to_dict()
print(t)


In [None]:
tweet_text = tweets_no_rts_df_en['tweet_text_clean']
tweet_text_list = tweet_text.tolist()
print(len(tweet_text_list))
