# Twitter Sentiment Analysis

## Loading the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train = pd.read_csv('../training.1600000.processed.noemoticon.csv', encoding='iso-8859-1', header=None, names=['Sentiment','Id', 'Date', 'Query', 'User', 'Text'])

In [3]:
train.head()

Unnamed: 0,Sentiment,Id,Date,Query,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
train.drop(columns=['Id','Query','User','Date'], axis=1, inplace=True)

In [5]:
train.head()

Unnamed: 0,Sentiment,Text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Cleaning the Data

In [50]:
import re
import string
from nltk.stem import PorterStemmer
#from nltk.corpus import stopwords

#create an object of class PorterStemmer
porter = PorterStemmer()

#edited_stop_words = [re.sub('[\']+', '',word) for word in stopwords.words('english')]

auxiliaryVerbs = ['do','does','did','has','have','had','should','must','can','could']
splitNegativeWords = re.compile(r'('+r'|'.join(auxiliaryVerbs)+r')n?\'t')

def prepare_tweet(tweet_text):
    # Removing Hashtags
    tweet_aux = re.sub('(^|\ )#[^ ]+', '', str.lower(tweet_text))
    # Removing mentions
    tweet_aux = re.sub('(^|\ )@[^ ]+', '', tweet_aux)
    # Removing URLs
    tweet_aux =  re.sub('https?://[^ ]+', '', tweet_aux)
    tweet_aux =  re.sub('www.[^ ]+', '', tweet_aux)
    # Removing symbols and numbers
    tweet_aux = re.sub('[^A-Za-z \n]+', '', tweet_aux)
    ## Remove stopwords - Commented because resulted in a worse accuracy
    ## tweet_aux =  [word for word in tweet.split() if word not in edited_stop_words]
    # Splitting contracted negative auxiliary verbs
    tweet_aux = splitNegativeWords.sub("\\1 not", tweet_aux)
    # Stemming
    stem_tokens = []
    for token in tweet_aux.split():
        stem_tokens.append(porter.stem(token))
    return ' '.join(stem_tokens)

#pattern_mentions = re.compile("(^|\ )@[A-Za-z0-9\-_]+")
#
#def is_there_mentions(tweet):
#    if pattern_mentions.search(tweet):
#        return 1
#    else:
#        return 0
#
#def num_words(processed_tweet):
#    return len(processed_tweet.split())


In [7]:
number_rows = 1000000
train_sample = pd.concat([train[0:int(number_rows/2)].copy(),train[800000:800000+int(number_rows/2)].copy()])

In [8]:
import datetime
datetime.datetime.now()

datetime.datetime(2019, 2, 19, 15, 26, 35, 614655)

In [9]:
train_sample['Tokens'] = train_sample['Text'].apply(prepare_tweet)
# train_sample['Mentions'] = train_sample['Text'].apply(is_there_mentions)
# train_sample['Num_words'] = train_sample['Tokens'].apply(num_words)

In [10]:
train_sample.head(10)

Unnamed: 0,Sentiment,Text,Tokens
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that a bummer you shoulda got david carr ...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant updat hi facebook by tex...
2,0,@Kenichan I dived many times for the ball. Man...,i dive mani time for the ball manag to save th...
3,0,my whole body feels itchy and like its on fire,my whole bodi feel itchi and like it on fire
4,0,"@nationwideclass no, it's not behaving at all....",no it not behav at all im mad whi am i here be...
5,0,@Kwesidei not the whole crew,not the whole crew
6,0,Need a hug,need a hug
7,0,@LOLTrish hey long time no see! Yes.. Rains a...,hey long time no see ye rain a bit onli a bit ...
8,0,@Tatiana_K nope they didn't have it,nope they didnt have it
9,0,@twittera que me muera ?,que me muera


In [11]:
datetime.datetime.now()

datetime.datetime(2019, 2, 19, 15, 29, 48, 259760)

In [12]:
# for word in top_words:
#     train_sample[word] = int(word in train_sample['Tokens'])

In [13]:
train_sample.reset_index(inplace=True)
train_sample.drop(columns=['index'], axis=1, inplace=True)
# train_sample.drop(columns=['index', 'level_0'], axis=1, inplace=True)

In [14]:
train_sample.head()

Unnamed: 0,Sentiment,Text,Tokens
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that a bummer you shoulda got david carr ...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant updat hi facebook by tex...
2,0,@Kenichan I dived many times for the ball. Man...,i dive mani time for the ball manag to save th...
3,0,my whole body feels itchy and like its on fire,my whole bodi feel itchi and like it on fire
4,0,"@nationwideclass no, it's not behaving at all....",no it not behav at all im mad whi am i here be...


In [15]:
import csv
import collections

with open('./word_count_100k.csv', 'r') as csvfile: 
    w = csv.DictReader(csvfile)
    word_count_aux = list(w)[0]

In [16]:
word_count_dict = {key:int(value) for key,value in word_count_aux.items()}

In [17]:
word_count = collections.Counter(word_count_dict)

In [18]:
word_count.most_common(10)

[('to', 36785),
 ('the', 32800),
 ('I', 30090),
 ('a', 23781),
 ('it', 19995),
 ('and', 18962),
 ('my', 16456),
 ('you', 16412),
 ('is', 14827),
 ('i', 14611)]

In [19]:
top_words = [word for (word, count) in word_count.most_common(2000)]

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
datetime.datetime.now()

datetime.datetime(2019, 2, 19, 15, 29, 48, 572922)

In [22]:
count = CountVectorizer(vocabulary=top_words, token_pattern=r"(?u)\b\w+\b")
bag_of_words = count.transform(train_sample['Tokens'])



In [23]:
# #count = CountVectorizer(token_pattern=r"(?u)\b\w+\b",ngram_range=(1,2), max_features=2000)
# count = CountVectorizer(ngram_range=(1,2), max_features=2000)
# bag_of_words = count.fit_transform(train_sample['Tokens'])


In [24]:
# Show feature matrix
# bow_set = bag_of_words.astype(np.int16).toarray()

In [25]:
datetime.datetime.now()

datetime.datetime(2019, 2, 19, 15, 29, 58, 377736)

In [26]:
count

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w+\\b', tokenizer=None,
        vocabulary=['to', 'the', 'I', 'a', 'it', 'and', 'my', 'you', 'is', 'i', 'in', 'for', 'of', 'that', 'have', 'on', 'me', 'go', 'but', 'be', 'just', 'so', 'with', 'get', 'not', 'at', 'day', 'Im', 'wa', 'work', 'thi', 'good', 'up', 'now', 'your', 'do', 'out', 'like', 'all', 'are', 'love', 'today', 'time...it', 'proper', 'yah', 'nerd', 'unfollow', 'mama', 'bubbl', 'intern', 'sweat', 'tempt', 'bah', 'msg'])

In [27]:
# bag_of_words.astype(np.int16).toarray() inline to save 2GB of RAM not creating the variable bow_set
train_sample = pd.concat([train_sample, pd.DataFrame(bag_of_words.astype(np.int8).toarray(), columns=top_words, dtype=np.int8)],sort=False,axis=1,join='inner')

## Logistic Regression Model

In [28]:
training_set = train_sample.drop(columns=['Text', 'Tokens'], axis=1)

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(training_set.drop('Sentiment',axis=1), 
                                                    training_set['Sentiment'], test_size=0.30, 
                                                    random_state=101)

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
# logmodel = LogisticRegression(solver='lbfgs')
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [33]:
datetime.datetime.now()

datetime.datetime(2019, 2, 19, 15, 31, 58, 446651)

In [34]:
predictions = logmodel.predict(X_test)

In [35]:
logmodel.intercept_

array([0.393645])

In [36]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [37]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.79      0.75      0.77    150146
          4       0.76      0.81      0.78    149854

avg / total       0.78      0.78      0.78    300000



In [38]:
print(accuracy_score(y_test,predictions))

0.77847


In [149]:
training_set.head()

Unnamed: 0,Sentiment,to,the,I,a,it,and,my,you,is,...,yah,nerd,unfollow,mama,bubbl,intern,sweat,tempt,bah,msg
0,0,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def sentiment(tweet_text):
    clean_tweet = prepare_tweet(tweet_text)
    counter = CountVectorizer(vocabulary=top_words, token_pattern=r"(?u)\b\w+\b")
    tweet_bag_of_words = counter.transform([clean_tweet])
    tweet_bow_df = pd.DataFrame(tweet_bag_of_words.astype(np.int8).toarray(), columns=top_words, dtype=np.int8)
    return logmodel.predict(tweet_bow_df)

sentiment("Fuck this shit Trump")

array([0], dtype=int64)

## Testing Ways to Save the Trained Model

Using Pickle to serialize the model and save to file:

In [45]:
import pickle
pickle.dump(logmodel, open('twitterML.model', 'wb'), protocol=2)

Saving the model coefficients to a CSV file:

In [44]:
import pickle
logmodel3 = pickle.load(open('twitterML.model', 'rb'))
predictions3 = logmodel3.predict(X_test)
print(accuracy_score(y_test,predictions3))

0.77847


## Checking Missclassified Tweets

In [502]:
pd.set_option('display.max_colwidth', -1)
errors = (y_test != pd.Series(predictions, index=y_test.index))
pd.concat([train_sample[['Text','Tokens']],y_test,pd.Series(predictions, index=y_test.index, name='Prediction')],sort=False,axis=1,join='inner').loc[errors[errors==True].sort_index().keys().tolist()]



Unnamed: 0,Text,Tokens,Sentiment,Prediction
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,i dive mani time for the ball manag to save the rest go out of bound,0,4
7,"@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ?",hey long time no see ye rain a bit onli a bit lol im fine thank how you,0,4
9,@twittera que me muera ?,que me muera,0,4
22,@angry_barista I baked you a cake but I ated it,i bake you a cake but i ate it,0,4
36,@cocomix04 ill tell ya the story later not a good day and ill be workin for like three more hours...,ill tell ya the stori later not a good day and ill be workin for like three more hour,0,4
45,@Viennah Yay! I'm happy for you with your job! But that also means less time for me and you...,yay im happi for you with your job but that also mean less time for me and you,0,4
86,@msdrama hey missed ya at the meeting sup mama,hey miss ya at the meet sup mama,0,4
91,wednesday my b-day! don't know what 2 do!!,wednesday my bday dont know what do,0,4
98,"@stark YOU don't follow me, either and i work for you!",you dont follow me either and i work for you,0,4
99,A bad nite for the favorite teams: Astros and Spartans lose. The nite out with T.W. was good.,a bad nite for the favorit team astro and spartan lose the nite out with tw wa good,0,4


## Testing Grid Search

In [71]:
param_grid = {'C': [0.1,1, 10], 'intercept_scaling': [10,1,0.1,], 'solver': ['lbfgs', 'liblinear']} 

In [72]:
from sklearn.model_selection import GridSearchCV

In [73]:
grid = GridSearchCV(LogisticRegression(),param_grid,refit=True,verbose=3)

In [74]:
grid.fit(X_train,y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] C=0.1, intercept_scaling=10, solver=lbfgs .......................
[CV]  C=0.1, intercept_scaling=10, solver=lbfgs, score=0.7791192025165642, total=   5.0s
[CV] C=0.1, intercept_scaling=10, solver=lbfgs .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.0s remaining:    0.0s


[CV]  C=0.1, intercept_scaling=10, solver=lbfgs, score=0.7769111098730141, total=   4.7s
[CV] C=0.1, intercept_scaling=10, solver=lbfgs .......................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.9s remaining:    0.0s


[CV]  C=0.1, intercept_scaling=10, solver=lbfgs, score=0.7775453964934236, total=   4.7s
[CV] C=0.1, intercept_scaling=10, solver=liblinear ...................
[CV]  C=0.1, intercept_scaling=10, solver=liblinear, score=0.7791363453247276, total=   8.0s
[CV] C=0.1, intercept_scaling=10, solver=liblinear ...................
[CV]  C=0.1, intercept_scaling=10, solver=liblinear, score=0.7769625385179122, total=   8.0s
[CV] C=0.1, intercept_scaling=10, solver=liblinear ...................
[CV]  C=0.1, intercept_scaling=10, solver=liblinear, score=0.7776182537403625, total=   8.3s
[CV] C=0.1, intercept_scaling=1, solver=lbfgs ........................
[CV]  C=0.1, intercept_scaling=1, solver=lbfgs, score=0.7791192025165642, total=   4.9s
[CV] C=0.1, intercept_scaling=1, solver=lbfgs ........................
[CV]  C=0.1, intercept_scaling=1, solver=lbfgs, score=0.7769111098730141, total=   4.7s
[CV] C=0.1, intercept_scaling=1, solver=lbfgs ........................
[CV]  C=0.1, intercept_scaling

[CV]  C=10, intercept_scaling=0.1, solver=liblinear, score=0.7777939682770976, total=  10.6s


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  7.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10], 'intercept_scaling': [10, 1, 0.1], 'solver': ['lbfgs', 'liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [67]:
grid.best_params_

{'C': 0.1, 'intercept_scaling': 0.1}

In [68]:
grid_predictions = grid.predict(X_test)

In [69]:
print(classification_report(y_test,grid_predictions))

             precision    recall  f1-score   support

          0       0.80      0.74      0.77    150146
          4       0.76      0.81      0.78    149854

avg / total       0.78      0.78      0.78    300000



In [75]:
print(accuracy_score(y_test,grid_predictions))

0.7775533333333333
