In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from string import punctuation
from itertools import chain
from string import digits

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, accuracy_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_selection import SelectPercentile, chi2, f_regression, f_classif

from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
nltk.download('stopwords')
nltk.download('wordnet')

from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin1') 

In [9]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [10]:
df.columns = ['polarity', 'tweet_id', 'date', 'query', 'user', 'tweet',]

In [11]:
df.head()

Unnamed: 0,polarity,tweet_id,date,query,user,tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


***
### 1. Create sentiment column to classify the sentiment of tweets using the polarity

According to the dataset, 0 means negative, 2 means neutral and 4 means positive. We do not need neutral tweets as they do not add value to our analysis of the tweet sentiment, so it would've been dropped, however, the dataset already had it dropped.

In [12]:
pos = df[df['polarity'] == 4] 
neg = df[df['polarity'] == 0]
neutral = df[df['polarity']==2]
print(pos.shape, neg.shape, neutral.shape)
#Gives us insight into the ratings and how they shape the data. 

(800000, 6) (799999, 6) (0, 6)


In [13]:
df = shuffle(df) #shuffled to make the data more represntative when used for the following operations of testing, training etc.
df.head()
#Reference: https://towardsdatascience.com/shuffling-rows-in-pandas-dataframes-eda052275635

Unnamed: 0,polarity,tweet_id,date,query,user,tweet
956413,4,1825221880,Sun May 17 04:51:55 PDT 2009,NO_QUERY,xlove_katiex,that voice in side your head saying... your ju...
426253,0,2063496279,Sun Jun 07 02:53:52 PDT 2009,NO_QUERY,Headphaze,My tweets from the Meadows Festival have arriv...
592208,0,2217890027,Wed Jun 17 21:32:40 PDT 2009,NO_QUERY,FreightTrainn,Ew. There is a massive slug thing on my side w...
785687,0,2324489615,Thu Jun 25 03:17:08 PDT 2009,NO_QUERY,hambers,"@g33kgurrl Man, I hate that. Arghhh. I was tr..."
302537,0,1999059667,Mon Jun 01 19:34:24 PDT 2009,NO_QUERY,Ally_KM,@4hoursstanding Feeling horrible today but I l...


In [14]:
y_map = {0:0, 4:1} #Map out values, 1 and 2 should map to 0 meaning a negative review, and 4 and 5 map to 1 for pos.
y = df['polarity'].map(y_map) #Now if we do df['column'].map we transform the data 1,2,4,5 into 0 and 1 making it binary

In [15]:
df['sentiment'] = y

In [16]:
df.head()

Unnamed: 0,polarity,tweet_id,date,query,user,tweet,sentiment
956413,4,1825221880,Sun May 17 04:51:55 PDT 2009,NO_QUERY,xlove_katiex,that voice in side your head saying... your ju...,1
426253,0,2063496279,Sun Jun 07 02:53:52 PDT 2009,NO_QUERY,Headphaze,My tweets from the Meadows Festival have arriv...,0
592208,0,2217890027,Wed Jun 17 21:32:40 PDT 2009,NO_QUERY,FreightTrainn,Ew. There is a massive slug thing on my side w...,0
785687,0,2324489615,Thu Jun 25 03:17:08 PDT 2009,NO_QUERY,hambers,"@g33kgurrl Man, I hate that. Arghhh. I was tr...",0
302537,0,1999059667,Mon Jun 01 19:34:24 PDT 2009,NO_QUERY,Ally_KM,@4hoursstanding Feeling horrible today but I l...,0


In [17]:
df.dtypes

polarity      int64
tweet_id      int64
date         object
query        object
user         object
tweet        object
sentiment     int64
dtype: object

***
### 2. Cleaning the data

1. Convert to lowercase
2. Rmove numbers
3. Remove punctuation marks 
4. Remove HTML tags
5. Convert emoticons to strings
6. Tokenize the data
7. Remove stop words
8. Stemming

In [47]:
def convert_to_lowercase(text):
    return text.lower()

In [48]:
def remove_nums(text):
    no_nums = "".join([i for i in text if i not in string.digits])
    return no_nums

In [49]:
def remove_punctuations(text):
    no_puncts = "".join([c for c in text if c not in string.punctuation])
    return no_puncts

In [50]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'lxml')
    no_html_tags = soup.get_text()
    return no_html_tags

In [51]:
###REPLACE EMOTICONS, HOWEVER, THE DATASET ALREADY REPLACED THEM

In [52]:
tokenizer = RegexpTokenizer('\w+')

In [53]:
en_stopwords = set(stopwords.words('english'))

In [54]:
def remove_stopwords(text):
    words = [w for w in text if w not in en_stopwords]
    return words

In [55]:
ps = PorterStemmer()

In [56]:
def word_stemmer(text):
    stemmed_text = " ".join([ps.stem(i) for i in text])
    return stemmed_text

In [74]:
def clean_text(text):
    cleaned_text = text.apply(convert_to_lowercase)
    cleaned_text = cleaned_text.apply(remove_nums)
    cleaned_text = cleaned_text.apply(remove_punctuations)
    cleaned_text = cleaned_text.apply(remove_html_tags)
    cleaned_text = cleaned_text.apply(lambda x: tokenizer.tokenize(x.lower()))
    cleaned_text = cleaned_text.apply(lambda x: remove_stopwords(x))
    cleaned_text = cleaned_text.apply(lambda x: word_stemmer(x))
    return cleaned_text

In [75]:
dftext = df['tweet']
dftext

956413    that voice in side your head saying... your ju...
426253    My tweets from the Meadows Festival have arriv...
592208    Ew. There is a massive slug thing on my side w...
785687    @g33kgurrl Man, I hate that. Arghhh.  I was tr...
302537    @4hoursstanding Feeling horrible today but I l...
                                ...                        
336765    @ahmedzainal oh noooooo that will bring lots o...
276482        Grrrrrr. iPod still in UPS' hub in Cologne!! 
466467    bloody hell, made a cuppa and put it down and ...
929837          @wantsum67 i'm ready for the back rub..... 
29586     shit the metric show is on tues and i didnt en...
Name: tweet, Length: 1599999, dtype: object

In [77]:
cleaned_tweets = clean_text(dftext)

In [79]:
df['cleaned_tweet'] = cleaned_tweets

In [80]:
df.head()

Unnamed: 0,polarity,tweet_id,date,query,user,tweet,sentiment,cleaned_tweet
956413,4,1825221880,Sun May 17 04:51:55 PDT 2009,NO_QUERY,xlove_katiex,that voice in side your head saying... your ju...,1,voic side head say justjust dummyyyy hm
426253,0,2063496279,Sun Jun 07 02:53:52 PDT 2009,NO_QUERY,Headphaze,My tweets from the Meadows Festival have arriv...,0,tweet meadow festiv arriv late must got stuck ...
592208,0,2217890027,Wed Jun 17 21:32:40 PDT 2009,NO_QUERY,FreightTrainn,Ew. There is a massive slug thing on my side w...,0,ew massiv slug thing side walk
785687,0,2324489615,Thu Jun 25 03:17:08 PDT 2009,NO_QUERY,hambers,"@g33kgurrl Man, I hate that. Arghhh. I was tr...",0,gkgurrl man hate arghhh tri demo yesterday dam...
302537,0,1999059667,Mon Jun 01 19:34:24 PDT 2009,NO_QUERY,Ally_KM,@4hoursstanding Feeling horrible today but I l...,0,hoursstand feel horribl today look like pirat ...


***
### Classification

In [81]:
#Reference: Lab 5
def text_fit(X, y, model, clf_model, coef_show=1): 
    X_c = model.fit_transform(X) 
    print('# features: {}'.format(X_c.shape[1]))
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=0) #Splits the data into one for training and one for testing
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    clf = clf_model.fit(X_train, y_train) #Fir the logistic reg model with the training data
    y_pred = clf.predict(X_test)  #Prediction using the Test data
    recall = recall_score(y_test,y_pred) #Calculate the recall score between actual predictions and model predictions
    accuracy = accuracy_score(y_test,y_pred)
    f1score = f1_score(y_test, y_pred) 
    print ('Model Recall: {}'.format(recall))
    if coef_show == 1:  #Extract the coefficients from the model and put it in a dataframe
        #print(confusion_matrix(y_test, y_pred))
        w = model.get_feature_names()
        coef = clf.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')        
        print(coeff_df.tail(20).to_string(index=False))
        return recall

In [82]:
X = df['cleaned_tweet']
y = df['sentiment']

In [83]:
tfidf = TfidfVectorizer()

In [85]:
tfidf_n = TfidfVectorizer(ngram_range=(1,2))

In [86]:
svm_tfidf_n = text_fit(X, y, tfidf_n, LinearSVC())

# features: 5200732
# train records: 1199999
# test records: 400000
Model Recall: 0.8192872243129377





-Top 20 positive-
           Word  Coefficient
      cant wait     6.323890
      wish luck     4.311233
    cannot wait     4.296175
    doesnt hurt     4.209040
          thank     3.773866
     dont worri     3.721359
wont disappoint     3.497343
        sad sad     3.492039
     noth wrong     3.463514
       isnt bad     3.433893
    dont forget     3.374591
          smile     3.308028
      wasnt bad     3.243278
     sad anymor     3.075254
     never fail     3.012563
      dont miss     2.968744
       aint bad     2.928135
    wont regret     2.908489
       cant bad     2.850790
    fair enough     2.842829

-Top 20 negative-
      Word  Coefficient
       cri    -3.901289
     broke    -3.946179
      suck    -3.977304
     upset    -3.977613
      hurt    -4.034242
    bummer    -4.068395
      hate    -4.086650
    cancel    -4.138764
      lost    -4.311724
       rip    -4.312589
   depress    -4.360937
      sick    -4.492867
disappoint    -4.510793
      wish    -4.

In [87]:
lr_tfidf_n = text_fit(X, y, tfidf_n, LogisticRegression())

# features: 5200732
# train records: 1199999
# test records: 400000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Recall: 0.8266488747619833





-Top 20 positive-
       Word  Coefficient
  cant wait    16.132923
      thank    13.052376
  wish luck     9.584853
     welcom     8.988087
      smile     8.921259
cannot wait     8.436763
     awesom     7.600849
       love     7.500623
 dont worri     7.484573
       hehe     7.418901
  wasnt bad     7.354722
        yay     7.089457
    congrat     6.910141
dont forget     6.866258
   isnt bad     6.838272
 noth wrong     6.826312
  congratul     6.743924
  dont miss     6.678214
      proud     6.472509
      excit     6.444141

-Top 20 negative-
      Word  Coefficient
     sorri    -9.134026
       rip    -9.156481
      lost    -9.390140
disappoint    -9.458118
   depress    -9.483757
       cri    -9.677994
      hurt    -9.760371
       ugh    -9.914124
     broke   -10.131570
      suck   -10.170157
   headach   -10.180339
  unfortun   -10.453134
      sick   -10.661646
     sadli   -10.867687
      hate   -11.599040
      wish   -12.517673
      cant   -12.905378
     

***
### Topic Modelling Using NMF

In [97]:
#Reference: Lab 5
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print("\n")
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
            print("\n")

In [98]:
documents = list(X)
print(len(documents))

1599999


In [99]:
#TFIDF model using NMF
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()



In [101]:
no_topics = 10

In [102]:
#Run NMF
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_



In [103]:
no_top_words = 20
no_top_documents = 4
print("NMF Topics \n\n")
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)

NMF Topics 


Topic 0:
im sorri tire gonna bore sad think sick still sure happi lol right excit realli glad hungri watch haha oh


isabelbm im


im im


yepcaitlinburn im


im jblphotograpghi


Topic 1:
work back today tomorrow hour still weekend readi doesnt home tire bore got earli week way sunday hard isnt tri


work


work


work


work


Topic 2:
good morn night luck everyon hope feel time that today sound look world thing twitter last bed pretti afternoon great


pixiepop good morn good night


good morn good night


good night good morn


good morn good night


Topic 3:
go want bed home wanna tomorrow school back sleep see today tonight wish rain away shop beach night readi soon


go studiofin


go


go


ammagawd go


Topic 4:
day today happi mother last great school anoth long beauti nice tomorrow bad hope one everyon birthday first rain start


day


beautifaul day


epicphenom day


day vacationwoooooohoooooo


Topic 5:
dont feel like know want realli think well look today m