# Sentiment Analysis

This project deals with the analysis of sentiment of tweets. It has been achieved by training Multinomial Naive Bayes and Linear Support Vector Classifier models. We have achieved accuracy scores of 74% and 71% for the models respectively.

### 1. Importing Libraries

In [2]:
import pandas
import matplotlib.pyplot as plt
import string

import re
from string import punctuation
from collections import Counter

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### 2. Reading the csv file and converting into Pandas Dataframe

In [3]:
tweets = pandas.read_csv('tweets_sentiment.csv',  encoding='ISO8859-1')
tweetsdf = pandas.DataFrame(tweets['Sentiment'], columns=['Sentiment'])
tweetsdf['Sentiment Text'] = tweets['SentimentText']
tweetsdf

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Sentiment,Sentiment Text
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...
5,0,or i just worry too much?
6,1,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,0,Sunny Again Work Tomorrow :-| ...
8,1,handed in my uniform today . i miss you ...
9,1,hmmmm.... i wonder how she my number @-)


### 3. Cleaning up Sentiment Text

In [4]:
# helper function to clean tweets
def processTweet(tweet):
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','',tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove Punctuation and split 's, 't, 've with a space for filter
    tweet = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ') 
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uFFFF') 
    return tweet
# ______________________________________________________________
# clean dataframe's text column
tweetsdf['Sentiment Text'] = tweetsdf['Sentiment Text'].apply(processTweet)
# preview some cleaned tweets
tweetsdf

Unnamed: 0,Sentiment,Sentiment Text
0,0,sad for apl friend
1,0,missed the new moon trailer
2,1,omg its already
3,0,omgaga sooo gunna cry been this dentist since ...
4,0,think cheating
5,0,just worry too much
6,1,juuuuuuuuuuuuuuuuussssst chillin
7,0,sunny again work tomorrow tonight
8,1,handed uniform today miss you already
9,1,hmmmm wonder how she number


### 4. Grouping Tweets by Sentiment

In [5]:
# check the number of positive vs. negative tagged sentences
positives = tweetsdf['Sentiment'][tweetsdf.Sentiment == 0]
negatives = tweetsdf['Sentiment'][tweetsdf.Sentiment == 1]


print('number of positve tagged sentences is:  {}'.format(len(positives)))
print('number of negative tagged sentences is: {}'.format(len(negatives)))
print('total length of the data is:            {}'.format(tweetsdf.shape[0]))

number of positve tagged sentences is:  494105
number of negative tagged sentences is: 554470
total length of the data is:            1048575


In [6]:
# get unique label counts
tweetsdf.groupby('Sentiment').describe()

Unnamed: 0_level_0,Sentiment Text,Sentiment Text,Sentiment Text,Sentiment Text
Unnamed: 0_level_1,count,unique,top,freq
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,494105,474344,,1813
1,554470,521254,,2076


### 5. Counting no. of words in each tweet

In [7]:
# get a word count per sentence column
def count_no_of_words(sentence):
    return len(sentence.split())
    
tweetsdf['word count'] = tweetsdf['Sentiment Text'].apply(count_no_of_words)
tweetsdf

Unnamed: 0,Sentiment,Sentiment Text,word count
0,0,sad for apl friend,4
1,0,missed the new moon trailer,5
2,1,omg its already,3
3,0,omgaga sooo gunna cry been this dentist since ...,15
4,0,think cheating,2
5,0,just worry too much,4
6,1,juuuuuuuuuuuuuuuuussssst chillin,2
7,0,sunny again work tomorrow tonight,5
8,1,handed uniform today miss you already,6
9,1,hmmmm wonder how she number,5


In [8]:
# get most common words in training dataset
all_words = []
for line in list(tweetsdf['Sentiment Text']):
    words = line.split()
    for word in words:
        all_words.append(word.lower())
    
    
Counter(all_words).most_common(10)


[('the', 331194),
 ('you', 243592),
 ('and', 194491),
 ('for', 138131),
 ('that', 122362),
 ('have', 102096),
 ('but', 92359),
 ('just', 88702),
 ('with', 71530),
 ('not', 70742)]

In [9]:
# drop duplicates
tweetsdf = tweetsdf.drop_duplicates('Sentiment Text')
tweetsdf.shape

(993412, 3)

### 6. Removing stopwords and Tokenization

In [10]:
# show stop words examples
stop_words = ['i' , 'me' , 'my' , 'myself' , 'we' , 'our' , 'ours' , 'ourselves' , 'you' , 'you\'re' , 'you\'ve' , 'you\'ll' , 'you\'d' , 'your' , 'yours' , 'yourself' , 'yourselves' , 'he' , 'him' , 'his' , 'himself' , 'she' , 'her' , 'hers' , 'herself' , 'it']



In [11]:
# tokenize helper function
def text_process(raw_text):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in list(raw_text) if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.lower().split() if word.lower() not in stop_words]


# tokenize message column and create a column for tokens
tweetsdf['tokens'] = tweetsdf['Sentiment Text'].apply(text_process) # tokenize style 1

tweetsdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Sentiment,Sentiment Text,word count,tokens
0,0,sad for apl friend,4,"[sad, for, apl, friend]"
1,0,missed the new moon trailer,5,"[missed, the, new, moon, trailer]"
2,1,omg its already,3,"[omg, its, already]"
3,0,omgaga sooo gunna cry been this dentist since ...,15,"[omgaga, sooo, gunna, cry, been, this, dentist..."
4,0,think cheating,2,"[think, cheating]"
5,0,just worry too much,4,"[just, worry, too, much]"
6,1,juuuuuuuuuuuuuuuuussssst chillin,2,"[juuuuuuuuuuuuuuuuussssst, chillin]"
7,0,sunny again work tomorrow tonight,5,"[sunny, again, work, tomorrow, tonight]"
8,1,handed uniform today miss you already,6,"[handed, uniform, today, miss, already]"
9,1,hmmmm wonder how she number,5,"[hmmmm, wonder, how, number]"


In [12]:
# split sentences to get individual words
all_words = []
for line in tweetsdf['tokens']: # try 'tokens'
    all_words.extend(line)
    
# create a word frequency dictionary
wordfreq = Counter(all_words)
wordfreq

Counter({'sad': 17309,
         'for': 135410,
         'apl': 2,
         'friend': 7401,
         'missed': 7346,
         'the': 326745,
         'new': 24654,
         'moon': 1515,
         'trailer': 772,
         'omg': 5762,
         'its': 29511,
         'already': 9740,
         'omgaga': 1,
         'sooo': 3513,
         'gunna': 749,
         'cry': 2834,
         'been': 23350,
         'this': 54694,
         'dentist': 760,
         'since': 6289,
         'was': 69490,
         'suposed': 17,
         'just': 88010,
         'get': 53522,
         'crown': 97,
         'put': 5505,
         '30mins': 54,
         'think': 29263,
         'cheating': 146,
         'worry': 1919,
         'too': 46495,
         'much': 23860,
         'juuuuuuuuuuuuuuuuussssst': 1,
         'chillin': 1145,
         'sunny': 2508,
         'again': 18106,
         'work': 36342,
         'tomorrow': 19096,
         'tonight': 14638,
         'handed': 156,
         'uniform': 106,
     

### 7. Vectorization and Transformation of Count Matrix

In [13]:
# vetorize
bow_transformer = CountVectorizer(analyzer=text_process).fit(tweetsdf['Sentiment Text'])
# print total number of vocab words
print(len(bow_transformer.vocabulary_))

216196


In [14]:
#entire word vocabulary 
bow_transformer.vocabulary_

{'sad': 161838,
 'for': 72879,
 'apl': 16572,
 'friend': 74444,
 'missed': 124832,
 'the': 186192,
 'new': 132287,
 'moon': 126642,
 'trailer': 190828,
 'omg': 137628,
 'its': 98856,
 'already': 13746,
 'omgaga': 137635,
 'sooo': 174033,
 'gunna': 83504,
 'cry': 49027,
 'been': 24962,
 'this': 187042,
 'dentist': 53687,
 'since': 170146,
 'was': 202512,
 'suposed': 180897,
 'just': 103191,
 'get': 78249,
 'crown': 48815,
 'put': 151901,
 '30mins': 3319,
 'think': 186953,
 'cheating': 39682,
 'worry': 207952,
 'too': 189792,
 'much': 128200,
 'juuuuuuuuuuuuuuuuussssst': 103310,
 'chillin': 40630,
 'sunny': 180502,
 'again': 11267,
 'work': 207698,
 'tomorrow': 189538,
 'tonight': 189687,
 'handed': 85347,
 'uniform': 196749,
 'today': 189090,
 'miss': 124808,
 'hmmmm': 89750,
 'wonder': 207097,
 'how': 91759,
 'number': 135440,
 'must': 129082,
 'about': 9222,
 'positive': 148227,
 'thanks': 185953,
 'all': 13264,
 'haters': 86365,
 'face': 67574,
 'day': 51772,
 '112': 639,
 '102': 426

In [15]:
# transform the entire DataFrame of messages
sentiment_text_bow = bow_transformer.transform(tweetsdf['Sentiment Text'])

# check out the bag-of-words counts for the entire corpus as a large sparse matrix
print('Shape of Sparse Matrix: ', sentiment_text_bow.shape)
print('Amount of Non-Zero occurences: ', sentiment_text_bow.nnz)

Shape of Sparse Matrix:  (993412, 216196)
Amount of Non-Zero occurences:  8964919


In [16]:
# from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(sentiment_text_bow)

# to transform the entire bag-of-words corpus
sentiment_text_tfidf = tfidf_transformer.transform(sentiment_text_bow)
print(sentiment_text_tfidf.shape)

(993412, 216196)


## Multinomial Naive Bayes

In [16]:
# Run Train Data Through Pipeline analyzer=text_process
# uncomment below to train on a larger dataset but it is very slow for a regular laptop

X_train, X_test, y_train, y_test = train_test_split(tweetsdf['Sentiment Text'], tweetsdf['Sentiment'], test_size=0.2)



# create pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer(strip_accents='ascii',
                            stop_words='english',
                            lowercase=True)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

# this is where we define the values for GridSearchCV to iterate over
parameters = {'bow__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'classifier__alpha': (1e-2, 1e-3),
             }

# do 2-fold cross validation for each of the 8 possible combinations of the above params
grid = GridSearchCV(pipeline, cv=2, param_grid=parameters, verbose=1)
grid.fit(X_train,y_train)

# summarize results
print("\nBest Model: %f using %s" % (grid.best_score_, grid.best_params_))
print('\n')
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("Mean: %f Stdev:(%f) with: %r" % (mean, stdev, param))

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:  5.6min finished



Best Model: 0.742639 using {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.01, 'tfidf__use_idf': False}


Mean: 0.730293 Stdev:(0.000219) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.01, 'tfidf__use_idf': True}
Mean: 0.742639 Stdev:(0.000144) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.01, 'tfidf__use_idf': False}
Mean: 0.729915 Stdev:(0.000217) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.001, 'tfidf__use_idf': True}
Mean: 0.739953 Stdev:(0.000127) with: {'bow__ngram_range': (1, 1), 'classifier__alpha': 0.001, 'tfidf__use_idf': False}
Mean: 0.715138 Stdev:(0.000265) with: {'bow__ngram_range': (1, 2), 'classifier__alpha': 0.01, 'tfidf__use_idf': True}
Mean: 0.731885 Stdev:(0.000216) with: {'bow__ngram_range': (1, 2), 'classifier__alpha': 0.01, 'tfidf__use_idf': False}
Mean: 0.707476 Stdev:(0.000324) with: {'bow__ngram_range': (1, 2), 'classifier__alpha': 0.001, 'tfidf__use_idf': True}
Mean: 0.719033 Stdev:(0.000158) with: {'bow__ngram_range'

In [17]:
# save best model to current working directory
joblib.dump(grid, "twitter_sentiment.pkl")

['twitter_sentiment.pkl']

#### Accuracy Score and Confusion Matrix

In [18]:
# load from file and predict using the best configs found in the CV step
model_NB = joblib.load("twitter_sentiment.pkl" )

# get predictions from best model above
y_preds = model_NB.predict(X_test)

print('accuracy score: ',accuracy_score(y_test, y_preds))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds))
print('\n')
print(classification_report(y_test, y_preds))

accuracy score:  0.7470040214814554


confusion matrix: 
 [[67254 27325]
 [22941 81163]]


             precision    recall  f1-score   support

          0       0.75      0.71      0.73     94579
          1       0.75      0.78      0.76    104104

avg / total       0.75      0.75      0.75    198683



## Linear Support Vector Classifier

In [17]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(tweetsdf['Sentiment Text'][:20000], tweetsdf['Sentiment'][:20000], test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(tweetsdf['Sentiment Text'], tweetsdf['Sentiment'], test_size=0.2)      # Extremely slow with the complete dataset
text_clf_SVC = Pipeline([('bow', CountVectorizer(strip_accents='ascii',stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('classifier', SVC(kernel='linear')),
])
text_clf_SVC.fit(X_train, y_train)
print('Done!')

Done!


#### Accuracy Score and Confusion Matrix

In [18]:
# get predictions from best model above
y_preds_SVC = text_clf_SVC.predict(X_test)

print('accuracy score: ',accuracy_score(y_test, y_preds_SVC))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds_SVC))
print('\n')
print(classification_report(y_test, y_preds_SVC))

accuracy score:  0.7105


confusion matrix: 
 [[1540  558]
 [ 600 1302]]


             precision    recall  f1-score   support

          0       0.72      0.73      0.73      2098
          1       0.70      0.68      0.69      1902

avg / total       0.71      0.71      0.71      4000

