# USE CASE : SENTIMENT ANALYSIS

We have been given a set of movie reviews, split into training and test data, in which every review is labelled with its sentiment (`pos` or `neg`). We train a Naive Bayes classifier on the training data and then check the model by predicting the sentiments of the test reviews and comparing them with the given labels. The model reaches an accuracy of about 74%.

In [1]:
# Import Libraries

In [2]:
import pandas as pd
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import SklearnClassifier
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import wordnet

In [3]:
# Import training data

In [4]:
train = pd.read_excel('training_movie.xlsx')

In [5]:
train.head()

Unnamed: 0,Text,Sentiment
0,over rated and the people cant act worth crap....,neg
1,Not Everybody loves Raymond. The only joke he...,neg
2,"Sorry, I\'ve given GA several chances and it\'...",neg
3,David Caruso is just not good. He jut seems l...,neg
4,"good show, but takes way too manybreaks!!!!!!!...",neg


In [6]:
len(train)

386

In [7]:
# Import test data

In [8]:
test = pd.read_excel('test_movie.xlsx')

In [9]:
test.head()

Unnamed: 0,Text,Sentiment
0,"It had some good moments but its over, let it go.",neg
1,"RAY ROMANO IS NOT FUNNY, PLEASE DONT ENCOURAGE...",neg
2,Went from fun and exciting to - I just deleted...,neg
3,blueprints on his arms come on people!,neg
4,A pale imitation of the original British serie...,neg


In [10]:
len(test)

81

In [11]:
# Segregate positive and negative reviews from train data

In [12]:
# Text of every training review labelled 'pos', selected in a single step.
train_pos = train.loc[train['Sentiment'] == 'pos', 'Text']

In [13]:
# Text of every training review labelled 'neg', selected in a single step.
train_neg = train.loc[train['Sentiment'] == 'neg', 'Text']

In [14]:
# Disabled cell: the word-cloud helper and its calls are wrapped in a
# triple-quoted string, so none of this executes -- the cell only evaluates to
# the string itself.
# NOTE(review): the calls at the bottom reference `train_pos1` / `train_neg1`,
# which no visible cell defines (the segregated series are named `train_pos` /
# `train_neg`); fix those names before re-enabling this code.
''' def wordcloud_draw(data, color = 'black'):
    words = ' '.join(data)
    cleaned_word = " ".join([word for word in words.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and not word.startswith('#')
                                and word != 'RT'
                            ])
    wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color=color,
                      width=2500,
                      height=2000
                     ).generate(cleaned_word)
    plt.figure(1,figsize=(13, 13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
    
print("Positive words")
wordcloud_draw(train_pos1,'white')
print("Negative words")
wordcloud_draw(train_neg1) '''

' def wordcloud_draw(data, color = \'black\'):\n    words = \' \'.join(data)\n    cleaned_word = " ".join([word for word in words.split()\n                            if \'http\' not in word\n                                and not word.startswith(\'@\')\n                                and not word.startswith(\'#\')\n                                and word != \'RT\'\n                            ])\n    wordcloud = WordCloud(stopwords=STOPWORDS,\n                      background_color=color,\n                      width=2500,\n                      height=2000\n                     ).generate(cleaned_word)\n    plt.figure(1,figsize=(13, 13))\n    plt.imshow(wordcloud)\n    plt.axis(\'off\')\n    plt.show()\n    \nprint("Positive words")\nwordcloud_draw(train_pos1,\'white\')\nprint("Negative words")\nwordcloud_draw(train_neg1) '

In [15]:
train_pos.head()

194    While the networks continue to produce mindles...
195    Really enjoy this show, but it seems to never ...
196    James Spader\'s and WIlliam Shatner\'s charach...
197    Whaaaat  How can Friends not be cherished  One...
198    This is one of the better shows on TV. Absolut...
Name: Text, dtype: object

In [16]:
stop_words = stopwords.words('english')

In [17]:
# Preprocess data : convert to lowercase, remove punctuation, remove stop words

In [18]:
import re
def preprocess(data, stop_list=None):
    """Tokenise each review into lowercase word tokens with stop words removed.

    Parameters
    ----------
    data : iterable of str
        Raw review texts.
    stop_list : iterable of str, optional
        Words to drop from every review. When omitted, the notebook-level
        ``stop_words`` list is used, which matches the original behaviour.

    Returns
    -------
    list of list of str
        One token list per review, in the same order as ``data``.
    """
    # Build the lookup once: set membership is a hash lookup, whereas testing
    # against the list scanned every stop word for every token.
    blocked = frozenset(stop_words if stop_list is None else stop_list)

    reviews_tokens = []
    for review in data:
        # \w+ keeps runs of word characters only, which strips punctuation.
        raw_word_tokens = re.findall(r'\w+', review.lower(), flags=re.UNICODE)
        reviews_tokens.append([w for w in raw_word_tokens if w not in blocked])
    return reviews_tokens

In [19]:
l1 = preprocess(train_pos)

In [20]:
l2 = preprocess(train_neg)

In [21]:
# Segregate positive and negative reviews from test data

In [22]:
# Text of every test review labelled 'pos', selected in a single step.
test_pos = test.loc[test['Sentiment'] == 'pos', 'Text']

In [23]:
# Text of every test review labelled 'neg', selected in a single step.
test_neg = test.loc[test['Sentiment'] == 'neg', 'Text']

In [24]:
l4 = preprocess(test_neg)

In [25]:
l3 = preprocess(test_pos)

In [26]:
# Create dictionary for Naive Bayes Classifier

In [27]:
def create_word_features(words, stop_list=None):
    """Build the ``{token: True}`` feature dictionary for one tokenised review.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    stop_list : iterable of str, optional
        Words to leave out of the features. When omitted, the NLTK English
        stop-word list is used, which matches the original behaviour.

    Returns
    -------
    dict
        Maps every kept token to ``True``; keys appear in order of first
        occurrence and repeated tokens collapse into one key.
    """
    if stop_list is None:
        # Resolve the stop words once per call. Previously
        # stopwords.words("english") sat inside the comprehension condition
        # and was therefore called again for every single token.
        stop_list = stopwords.words("english")
    blocked = frozenset(stop_list)
    return {word: True for word in words if word not in blocked}

In [28]:
create_word_features(l1[0])

{'america': True,
 'cold': True,
 'continue': True,
 'drivel': True,
 'far': True,
 'funniest': True,
 'get': True,
 'got': True,
 'intelligent': True,
 'like': True,
 'mindless': True,
 'networks': True,
 'oc': True,
 'produce': True,
 'right': True,
 'scrubs': True,
 'shame': True,
 'shoulder': True,
 'show': True,
 'shows': True,
 'smartest': True,
 'talent': True,
 'television': True}

In [29]:
# Create dictionaries for positive and negative reviews (train and test data)

In [30]:
# Pair the features of every positive training review with its label.
# Iterating the token lists directly avoids the index loop and leaves no
# stray loop counter behind in the notebook namespace.
pos_reviews = [(create_word_features(tokens), "positive") for tokens in l1]
print(len(pos_reviews))

192


In [31]:
# Pair the features of every negative training review with its label.
# Iterating the token lists directly avoids the index loop and leaves no
# stray loop counter behind in the notebook namespace.
neg_reviews = [(create_word_features(tokens), "negative") for tokens in l2]
print(len(neg_reviews))

194


In [32]:
# Pair the features of every positive test review with its label.
# Iterating the token lists directly avoids the index loop and leaves no
# stray loop counter behind in the notebook namespace.
pos_reviewst = [(create_word_features(tokens), "positive") for tokens in l3]
print(len(pos_reviewst))

41


In [33]:
# Pair the features of every negative test review with its label.
# Iterating the token lists directly avoids the index loop and the ambiguous
# single-letter loop variable `l`.
neg_reviewst = [(create_word_features(tokens), "negative") for tokens in l4]
print(len(neg_reviewst))

40


In [34]:
# Create train and test set

In [35]:
# Each split lists its negative reviews first, followed by the positive ones.
train_set = [*neg_reviews, *pos_reviews]
test_set = [*neg_reviewst, *pos_reviewst]
print(len(train_set), len(test_set))

386 81


In [36]:
# Train the Naive Bayes Classifier on the training set

In [37]:
classifier = NaiveBayesClassifier.train(train_set)

In [38]:
# Calculate accuracy Naive Bayes

In [39]:
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(accuracy * 100)

74.07407407407408


In [40]:
# Most important features

In [41]:
classifier.show_most_informative_features(15)

Most Informative Features
                    best = True           positi : negati =      7.7 : 1.0
                 awesome = True           positi : negati =      7.1 : 1.0
                    used = True           negati : positi =      6.9 : 1.0
                favorite = True           positi : negati =      6.4 : 1.0
                  stupid = True           negati : positi =      6.3 : 1.0
                    wait = True           positi : negati =      5.7 : 1.0
                    dumb = True           negati : positi =      5.6 : 1.0
                     csi = True           negati : positi =      5.6 : 1.0
               excellent = True           positi : negati =      5.5 : 1.0
                 someone = True           positi : negati =      5.1 : 1.0
                  second = True           positi : negati =      5.1 : 1.0
                 finally = True           positi : negati =      5.1 : 1.0
            entertaining = True           positi : negati =      5.1 : 1.0

In [42]:
# Classifier Labels

In [43]:
classifier.labels()

['negative', 'positive']

In [44]:
# Classifier Example

In [45]:
# Build the features through the same lower-casing / stop-word pipeline that
# produced the training features, instead of a hand-written dict whose
# capitalised "This" and stop word "is" that pipeline never emits.
classifier.classify(create_word_features(preprocess(["This is wonderful"])[0]))

'positive'

In [46]:
# Build the features through the same lower-casing / stop-word pipeline that
# produced the training features, instead of a hand-written dict whose
# capitalised "This" and stop word "is" that pipeline never emits.
classifier.classify(create_word_features(preprocess(["This is bad"])[0]))

'negative'