#  Basic Sentiment Analysis using Naive Bayes classifier on Movie Reviews Dataset

In [4]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.7.24-cp39-cp39-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ------------------ ------------------- 20.5/41.5 kB 640.0 kB/s eta 0:00:01
     -------------------------------------- 41.5/41.5 kB 666.1 kB/s eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ----- ---------------------------------- 0.2/1.5 MB 5.9 MB/s eta 0:00:01
   -------------- ------------------------- 0.5/1.5 MB 6.7 MB/s eta 0:00:01
   ---------------------------------------  1.5/1.5 MB 13.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 13.7 MB/s eta 0:00:00
Downloading regex-2024.7.24-cp39-cp39-win_amd64.whl (269 kB)
   ----------------



In [5]:
import nltk
import random
from nltk.corpus import movie_reviews

In [10]:
from nltk.classify import ClassifierI
from statistics import mode
import numpy as np
from nltk.tokenize import word_tokenize
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abidm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [11]:
short_pos = open("short_reviews/positive.txt",encoding="latin-1").read()
short_neg = open("short_reviews/negative.txt",encoding="latin-1").read()

documents = []

#get all lines from positive as well as negative docs
for r in short_pos.split('\n'):
    documents.append( (r, "pos") )

for r in short_neg.split('\n'):
    documents.append( (r, "neg") )

#get all words
all_words = []

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

for i in np.arange(10):
    print(list(all_words.keys())[i:i+1], list(all_words.values())[i:i+1])
       
#chooose to use first 5000 words as features for our purpose.
word_features = list(all_words.keys())[:5000]
word_features[:10]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)

['the'] [10113]
['rock'] [34]
['is'] [3559]
['destined'] [8]
['to'] [4234]
['be'] [939]
['21st'] [6]
['century'] [18]
["'s"] [3537]
['new'] [206]


In [12]:
len(featuresets)

10664

In [13]:
training_set = featuresets[:5332]
testing_set =  featuresets[5332:]

In [19]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)



Original Naive Bayes Algo accuracy percent: 72.03675918979745
Most Informative Features
                   tries = True              neg : pos    =     15.3 : 1.0
                provides = True              pos : neg    =     11.9 : 1.0
                    warm = True              pos : neg    =     11.9 : 1.0
              unexpected = True              pos : neg    =     11.2 : 1.0
                   bland = True              neg : pos    =     10.1 : 1.0
             pretentious = True              neg : pos    =     10.1 : 1.0
                captures = True              pos : neg    =      9.9 : 1.0
               inventive = True              pos : neg    =      9.9 : 1.0
                powerful = True              pos : neg    =      9.6 : 1.0
                  school = True              neg : pos    =      8.8 : 1.0
                touching = True              pos : neg    =      8.8 : 1.0
                  moving = True              pos : neg    =      8.7 : 1.0
            

In [15]:
def sentiment(text):
    feats = find_features(text)
    pred=classifier.classify(feats)
    return pred, classifier.prob_classify(feats).prob(pred)

In [18]:
print(sentiment("This movie was good and awesome. The acting was great, plot was wonderful, and amazing"))

('pos', 0.943961573059633)


In [17]:
print(sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))

('neg', 0.9986464663726083)
