In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from string import punctuation
from itertools import chain
from string import digits

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, accuracy_score,confusion_matrix, classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_selection import SelectPercentile, chi2, f_regression, f_classif

from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
nltk.download('stopwords')
nltk.download('wordnet')

from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TweetTokenizer
from emoji import demojize

import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin1') 

In [3]:
df1 = pd.read_csv('testdata.manual.2009.06.14.csv') 

In [4]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [5]:
df.columns = ['polarity', 'tweet_id', 'date', 'query', 'user', 'tweet',]

In [6]:
df1.columns = ['polarity', 'tweet_id', 'date', 'query', 'user', 'tweet',]

In [7]:
df1.head()

Unnamed: 0,polarity,tweet_id,date,query,user,tweet
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,@richardebaker no. it is too big. I'm quite ha...


In [8]:
df.head()

Unnamed: 0,polarity,tweet_id,date,query,user,tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


***
### 1. Create sentiment column to classify the sentiment of tweets using the polarity

According to the dataset, 0 means negative, 2 means neutral and 4 means positive. We do not need neutral tweets as they do not add value to our analysis of the tweet sentiment, so it would've been dropped, however, the dataset already had it dropped.

In [9]:
pos = df[df['polarity'] == 4] 
neg = df[df['polarity'] == 0]
neutral = df[df['polarity']==2]
print(pos.shape, neg.shape, neutral.shape)
#Gives us insight into the polarity and how they shape the data. 

(800000, 6) (799999, 6) (0, 6)


In [10]:
df = shuffle(df) #shuffled to make the data more represntative when used for the following operations of testing, training etc.
df.head()
#Reference: https://towardsdatascience.com/shuffling-rows-in-pandas-dataframes-eda052275635

Unnamed: 0,polarity,tweet_id,date,query,user,tweet
68092,0,1692674612,Sun May 03 20:31:16 PDT 2009,NO_QUERY,lizoutline,had that nightmare again where i wake up and m...
1336530,4,2017267875,Wed Jun 03 08:07:09 PDT 2009,NO_QUERY,helloimwee,work bleh ha come visit today
423587,0,2062906222,Sun Jun 07 00:41:55 PDT 2009,NO_QUERY,pempin96,School is finally over!!! :] Yet....its upsett...
1197113,4,1984916790,Sun May 31 16:03:48 PDT 2009,NO_QUERY,KSuds1313,"@YoungQ Hope you're not missing Va Bch, DC, Jo..."
662177,0,2243513264,Fri Jun 19 13:32:41 PDT 2009,NO_QUERY,KatyJW,I'm sad because my best friend Brittany is not...


In [11]:
y_map = {0:0, 4:1} #Map out values, 0 means a negative polarity, so map to 0, and 4 means positive polarity so map 4 to 1 for pos.
y = df['polarity'].map(y_map) #Now if we do df['column'].map we transform the data 0,4 into 0 and 1.

In [12]:
df['sentiment'] = y

In [13]:
df.head()

Unnamed: 0,polarity,tweet_id,date,query,user,tweet,sentiment
68092,0,1692674612,Sun May 03 20:31:16 PDT 2009,NO_QUERY,lizoutline,had that nightmare again where i wake up and m...,0
1336530,4,2017267875,Wed Jun 03 08:07:09 PDT 2009,NO_QUERY,helloimwee,work bleh ha come visit today,1
423587,0,2062906222,Sun Jun 07 00:41:55 PDT 2009,NO_QUERY,pempin96,School is finally over!!! :] Yet....its upsett...,0
1197113,4,1984916790,Sun May 31 16:03:48 PDT 2009,NO_QUERY,KSuds1313,"@YoungQ Hope you're not missing Va Bch, DC, Jo...",1
662177,0,2243513264,Fri Jun 19 13:32:41 PDT 2009,NO_QUERY,KatyJW,I'm sad because my best friend Brittany is not...,0


In [14]:
df.dtypes

polarity      int64
tweet_id      int64
date         object
query        object
user         object
tweet        object
sentiment     int64
dtype: object

***
### 2. Cleaning the data

1. Convert to lowercase
2. Rmove numbers
3. Remove punctuation marks 
4. Remove HTML tags
5. Convert emoticons to strings
6. Tokenize the data
7. Remove stop words
8. Stemming

In [15]:
def remove_usernames_links(text):
    text = re.sub('@[^\s]+','',str(text))
    text = re.sub('http[^\s]+','',str(text))
    return text

In [16]:
def convert_to_lowercase(text):
    return text.lower()

In [17]:
def remove_nums(text):
    no_nums = "".join([i for i in text if i not in string.digits])
    return no_nums

In [18]:
def remove_punctuations(text):
    no_puncts = "".join([c for c in text if c not in string.punctuation])
    return no_puncts

In [19]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'lxml')
    no_html_tags = soup.get_text()
    return no_html_tags

In [20]:
def remove_emojis(text):
    no_emoji = demojize(text)
    return no_emoji

In [21]:
tokenizer = RegexpTokenizer('\s+', gaps = True)

In [22]:
tk = TweetTokenizer()

In [23]:
en_stopwords = set(stopwords.words('english'))

In [24]:
def remove_stopwords(text):
    words = [w for w in text if w not in en_stopwords]
    return words

In [25]:
lemmatizer = WordNetLemmatizer()

In [26]:
def word_lemmatizer(text):
    lemmatized_text = " ".join([lemmatizer.lemmatize(i) for i in text])
    return lemmatized_text

In [27]:
def clean_text(text):
    cleaned_text = text.apply(remove_usernames_links)
    cleaned_text = cleaned_text.apply(convert_to_lowercase)
    cleaned_text = cleaned_text.apply(remove_nums)
    cleaned_text = cleaned_text.apply(remove_emojis) 
    cleaned_text = cleaned_text.apply(remove_punctuations)
    cleaned_text = cleaned_text.apply(remove_html_tags)
    cleaned_text = cleaned_text.apply(tokenizer.tokenize)
    cleaned_text = cleaned_text.apply(lambda x: remove_stopwords(x))
    cleaned_text = cleaned_text.apply(lambda x: word_lemmatizer(x))
    return cleaned_text

In [28]:
def clean(text):
    cleaned_text = []
    
    #text = text.lower()
    text = remove_emojis(text)
    text = remove_nums(text)
    text = remove_punctuations(text)
    text = remove_html_tags(text)
    text = tokenizer.tokenize(text)
    text = remove_stopwords(text)
    text = word_lemmatizer(text)
    
    cleaned_text.append(text)
    
    return cleaned_text

In [29]:
dftext = df['tweet']
dftext

68092      had that nightmare again where i wake up and m...
1336530                       work bleh ha come visit today 
423587     School is finally over!!! :] Yet....its upsett...
1197113    @YoungQ Hope you're not missing Va Bch, DC, Jo...
662177     I'm sad because my best friend Brittany is not...
                                 ...                        
119448     @ChubbyGayMan I am good, just kinda tired. Try...
304177     @millertaylor i wish i could but i don't have ...
42082            Sending all thoughts and prayers to Layla. 
761509           is getting annoyed with always being bored 
1015627    Good morning tweeps, been real busy with work ...
Name: tweet, Length: 1599999, dtype: object

In [30]:
cleaned_tweets = clean_text(dftext)

In [31]:
df['cleaned_tweet'] = cleaned_tweets

In [32]:
df.head()

Unnamed: 0,polarity,tweet_id,date,query,user,tweet,sentiment,cleaned_tweet
68092,0,1692674612,Sun May 03 20:31:16 PDT 2009,NO_QUERY,lizoutline,had that nightmare again where i wake up and m...,0,nightmare wake ceiling covered spider run get ...
1336530,4,2017267875,Wed Jun 03 08:07:09 PDT 2009,NO_QUERY,helloimwee,work bleh ha come visit today,1,work bleh ha come visit today
423587,0,2062906222,Sun Jun 07 00:41:55 PDT 2009,NO_QUERY,pempin96,School is finally over!!! :] Yet....its upsett...,0,school finally yetits upsetting
1197113,4,1984916790,Sun May 31 16:03:48 PDT 2009,NO_QUERY,KSuds1313,"@YoungQ Hope you're not missing Va Bch, DC, Jo...",1,hope youre missing va bch dc jones bch pnc sayin
662177,0,2243513264,Fri Jun 19 13:32:41 PDT 2009,NO_QUERY,KatyJW,I'm sad because my best friend Brittany is not...,0,im sad best friend brittany coming tomorrow


***
### Topic Modelling Using NMF

In [37]:
X = df['cleaned_tweet']
y = df['sentiment']

In [40]:
#Reference: Lab 5
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print("\n")
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
            print("\n")

In [41]:
documents = list(X)
print(len(documents))

1599999


In [42]:
#TFIDF model using NMF
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()



In [43]:
no_topics = 10

In [44]:
#Run NMF
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_



In [45]:
no_top_words = 20
no_top_documents = 8
print("NMF Topics \n\n")
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)

NMF Topics 


Topic 0:
day today happy great mother last school another beautiful long nice birthday tomorrow bad everyone hope first sunny mom father


day


day


r u day


wqtching e day


day


day


day


wolfvegas day


Topic 1:
good morning night luck everyone thats hope sound feel feeling thing time today world pretty afternoon monday weekend idea say


good night good morning


good morning good night


good night good morning


good morning good night


good morning good night


good night good morning


good morning good night


good morning good night


Topic 2:
thanks follow following followfriday much great ff hey haha guy ill lol awesome aww lot ok link appreciate cool sharing


thanks


thanks


thanks


thanks steeeeeeeeeeee


thanks


thanks


thanks


thanks


Topic 3:
im sorry going tired gonna sad bored sick sure still getting think happy right excited glad bed hungry hear home


im paaaaaaaaaaaaaaain


im


im downnnnnlt fuckyeaaaaa


im fandiddlytastic


im


im


***
### Classifying whether a user's tweet has a negative or positive sentiment

In [46]:
tfidf_n = TfidfVectorizer(ngram_range=(1,2))

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [48]:
tfidf_n.fit(X_train)

TfidfVectorizer(ngram_range=(1, 2))

In [49]:
X_train = tfidf_n.transform(X_train)
X_test  = tfidf_n.transform(X_test)

In [50]:
def assess(clf_model):
    
    y_pred = clf_model.predict(X_test)
    recall = recall_score(y_test,y_pred) #Calculate the recall score between actual predictions and model predictions
    accuracy = accuracy_score(y_test,y_pred)
    
    print(classification_report(y_test, y_pred))
    print ('Model Recall: {}'.format(recall))
    print ('Model Accuracy: {}'.format(accuracy))

In [51]:
svc_model = LinearSVC().fit(X_train, y_train)

In [52]:
w = tfidf_n.get_feature_names()
coef = svc_model.coef_.tolist()[0]
coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print('')
print('-Top 20 positive-')
print(coeff_df.head(20).to_string(index=False))
print('')
print('-Top 20 negative-')        
print(coeff_df.tail(20).to_string(index=False))




-Top 20 positive-
             Word  Coefficient
        cant wait     5.331313
      cannot wait     4.055507
      doesnt hurt     3.466264
wont disappointed     3.353734
        wish luck     3.242977
       get enough     3.077121
    nothing wrong     2.919350
     sick anymore     2.899433
        canâ wait     2.878087
         cant bad     2.729425
         aint bad     2.701641
            smile     2.701144
      wont regret     2.688319
        cant hurt     2.642435
        long lost     2.616629
      sorry delay     2.593132
      fair enough     2.583259
        wont hurt     2.573557
          smiling     2.562924
      sad anymore     2.556459

-Top 20 negative-
           Word  Coefficient
        saddest    -3.496384
       headache    -3.579155
inaperfectworld    -3.613851
         bummer    -3.634492
  unfortunately    -3.727567
         ruined    -3.756016
  disappointing    -3.756258
           cant    -3.819888
    passed away    -3.884650
           lost    -3

In [53]:
assess(svc_model)

              precision    recall  f1-score   support

           0       0.79      0.79      0.79    199768
           1       0.79      0.79      0.79    200232

    accuracy                           0.79    400000
   macro avg       0.79      0.79      0.79    400000
weighted avg       0.79      0.79      0.79    400000

Model Recall: 0.7929302009668784
Model Accuracy: 0.7905675


In [54]:
lgr_model = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
w = tfidf_n.get_feature_names()
coef = lgr_model.coef_.tolist()[0]
coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print('')
print('-Top 20 positive-')
print(coeff_df.head(20).to_string(index=False))
print('')
print('-Top 20 negative-')        
print(coeff_df.tail(20).to_string(index=False))




-Top 20 positive-
          Word  Coefficient
     cant wait    12.683616
   cannot wait    11.111315
        thanks     9.738492
     wish luck     8.841132
       smiling     7.953867
congratulation     7.926818
          glad     7.546843
     wasnt bad     7.534480
     dont miss     7.394682
       excited     7.279660
 nothing wrong     7.226360
    get enough     7.179144
   dont forget     6.974525
      isnt bad     6.974214
       amazing     6.962698
         thank     6.947529
       welcome     6.528431
           yay     6.458659
  followfriday     6.349990
      made day     6.346258

-Top 20 negative-
         Word  Coefficient
disappointing    -9.631143
         suck    -9.730674
       bummer    -9.731106
unfortunately   -10.032774
         hurt   -10.121871
       ruined   -10.462240
      missing   -10.516198
         died   -10.559800
    cancelled   -10.600008
   depressing   -10.668919
 disappointed   -10.959266
       gutted   -10.962342
         lost   -11.008

In [56]:
assess(lgr_model)

              precision    recall  f1-score   support

           0       0.80      0.79      0.80    199768
           1       0.79      0.80      0.80    200232

    accuracy                           0.80    400000
   macro avg       0.80      0.80      0.80    400000
weighted avg       0.80      0.80      0.80    400000

Model Recall: 0.8048313955811259
Model Accuracy: 0.7977575


In [57]:
def predict_sentiment(tfidf_n, model, text):
    tweet = tfidf_n.transform(clean(text))
    sentiment = model.predict(tweet)
    posp = "Positive Sentiment"
    negp = "Negative Sentiment"
    
    if sentiment == 0:
        return negp
    if sentiment == 1:
        return posp

In [58]:
#Enter your tweet to predict the sentiment:
tweet = ["I hate puppies"]
predict_sentiment(tfidf_n, svc_model, tweet)

'Negative Sentiment'

In [59]:
#Enter your tweet to predict the sentiment:
tweet = ["I love puppies"]
predict_sentiment(tfidf_n, svc_model, tweet)

'Positive Sentiment'

In [60]:
#Enter your tweet to predict the sentiment:
tweet = ["I hate puppies"]
predict_sentiment(tfidf_n, lgr_model, tweet)

'Negative Sentiment'

In [61]:
#Enter your tweet to predict the sentiment:
tweet = ["I love puppies"]
predict_sentiment(tfidf_n, lgr_model, tweet)

'Positive Sentiment'

References:

* Lab 5
* https://dylancastillo.co/nlp-snippets-clean-and-tokenize-text-with-python/#transform-emojis-to-characters
* https://machinelearningmastery.com/how-to-connect-model-input-data-with-predictions-for-machine-learning/
* https://www.academia.edu/74585014/Machine_Learning_Approach_to_Sentiment_Analysis_from_Movie_Reviews_Using_Word2Vec
* https://entertainment.bacsigan.com/popedaze/sentiment-analysis-in-python
* https://www.w3schools.com/python/python_lists_add.asp
* https://stackoverflow.com/questions/32106063/sklearn-linearsvc-x-has-1-features-per-sample-expecting-5
* https://datascience.stackexchange.com/questions/51224/why-does-transform-from-tfidf-vectorizer-sklearn-not-work