# Sentiment analysis of 1583820 words in movie review


Libraries

In [700]:
import nltk
nltk.download('movie_reviews') 
from nltk.corpus import movie_reviews
import random


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\bibek\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Count the number of words in movie reviews.

In [701]:
movie_reviews.words()


['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [702]:
len(movie_reviews.words())

1583820

Categorize review words according to their sentiments

In [703]:
movie_reviews.categories()


['neg', 'pos']

In [704]:
len(movie_reviews.fileids())

2000

Creates list named documents with each element in the list containing words used in the movie review respective to their categories.

In [705]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]


 Randomly shuffles the list

In [706]:
random.shuffle(documents)

Converts all the words in corpus to lower case

In [707]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [708]:
 nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bibek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Downloads stopwords from nltk and updates file

In [709]:
from nltk.corpus import stopwords

In [710]:
stopwords = nltk.corpus.stopwords.words('english')

In [711]:
import string

Removes all stop words and punctuations and updates it in all_words

In [712]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if w not in stopwords if w not in string.punctuation)

Creates a list named “word_features” with 2000 most frequent words

In [714]:
word_features = sorted(list(all_words.keys()),key=all_words.get,reverse=True)[:2000]


Defines a document_features function that shows whether each review file contains any of the 2000 most frequent words. 

In [715]:
def document_features(document): 
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

Applies the function to each element in the list documents and saves it under featuresets


In [716]:
featuresets = [(document_features(d), c) for (d,c) in documents]

In [717]:
len(featuresets) 

2000

Splits “featuresets” into 100 test_set and 1900 train_set 

In [718]:
train_set, test_set = featuresets[1900:], featuresets[:100]

Applies nltk’s NaiveBayes Classifier to the train_set.


In [719]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

Displays Accuracy of test set

In [721]:
print(nltk.classify.accuracy(classifier, test_set))

0.78


Shows the top 15 most informative features for movie_reviews

In [722]:
classifier.show_most_informative_features(15)

Most Informative Features
         contains(worst) = True              neg : pos    =      6.7 : 1.0
        contains(change) = True              pos : neg    =      6.6 : 1.0
        contains(camera) = True              neg : pos    =      6.1 : 1.0
       contains(perfect) = True              pos : neg    =      5.9 : 1.0
          contains(rock) = True              pos : neg    =      5.9 : 1.0
         contains(class) = True              pos : neg    =      5.9 : 1.0
      contains(supposed) = True              neg : pos    =      5.4 : 1.0
    contains(considered) = True              neg : pos    =      5.4 : 1.0
        contains(talent) = True              neg : pos    =      5.4 : 1.0
       contains(intense) = True              pos : neg    =      5.2 : 1.0
        contains(unique) = True              pos : neg    =      4.5 : 1.0
         contains(truly) = True              pos : neg    =      4.5 : 1.0
       contains(towards) = True              pos : neg    =      4.5 : 1.0

Twenty fold cross validation to show how the Bayes’ classifier performs over different subsets of “featuresets.”

In [723]:
accu=0

for i in range(1, 21):
    print(i)
    train_set, test_set = featuresets[:100*(i-1)]+featuresets[100*i:], featuresets[100*(i-1):100*i]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    arate=nltk.classify.accuracy(classifier, test_set)
    accu=accu+arate
    print('accuracy',arate)



1
accuracy 0.79
2
accuracy 0.81
3
accuracy 0.8
4
accuracy 0.78
5
accuracy 0.77
6
accuracy 0.84
7
accuracy 0.88
8
accuracy 0.86
9
accuracy 0.88
10
accuracy 0.83
11
accuracy 0.77
12
accuracy 0.83
13
accuracy 0.79
14
accuracy 0.82
15
accuracy 0.77
16
accuracy 0.82
17
accuracy 0.76
18
accuracy 0.77
19
accuracy 0.74
20
accuracy 0.8


Prints the overall prediction accuracy based on the average of the 20 accuracy rates

In [724]:
print(accu/i)

0.8055
