# NLP of Movie Reviews using nltk (Sentiment Analysis)

### Importing important module

In [0]:
import nltk
import string
import matplotlib.pyplot as plt
%matplotlib inline

### Download important dataset

In [2]:
nltk.download("movie_reviews")
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
from nltk.corpus import movie_reviews

In [4]:
print(len(movie_reviews.fileids()))
movie_reviews.fileids()[:5]

2000


['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [5]:
movie_reviews.fileids()[-5:]

['pos/cv995_21821.txt',
 'pos/cv996_11592.txt',
 'pos/cv997_5046.txt',
 'pos/cv998_14111.txt',
 'pos/cv999_13106.txt']

In [0]:
negative_fileids = movie_reviews.fileids('neg')
positive_fileids = movie_reviews.fileids('pos')

In [7]:
print(dir(movie_reviews)) # functions in movie_reviews

['CorpusView', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_add', '_c2f', '_delimiter', '_encoding', '_f2c', '_file', '_fileids', '_get_root', '_init', '_map', '_para_block_reader', '_pattern', '_read_para_block', '_read_sent_block', '_read_word_block', '_resolve', '_root', '_sent_tokenizer', '_tagset', '_unload', '_word_tokenizer', 'abspath', 'abspaths', 'categories', 'citation', 'encoding', 'ensure_loaded', 'fileids', 'license', 'open', 'paras', 'raw', 'readme', 'root', 'sents', 'unicode_repr', 'words']


In [8]:
len(negative_fileids), len(positive_fileids)

(1000, 1000)

In [9]:
print(movie_reviews.raw(fileids=positive_fileids[0]))

films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
in other words , don't dismiss this film because of its source . 
if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
getting the hughes brothers to direct this seems almost as 

In [10]:
print(movie_reviews.raw(fileids=negative_fileids[0]))

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn't snag this one correctly . 
they seem to have taken this pretty neat concept , but executed it terribly . 
so what are the problems with the movie ? 
well , its main problem is that it's simply too jumbled . 
it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no id

### Assigning ```useless_words```

In [11]:
useless_words = set(nltk.corpus.stopwords.words("english") + list(string.punctuation))
print(useless_words)

{'same', 'and', 'its', 'was', '\\', 'very', ';', 'against', 'me', 'whom', 'doing', 'won', 'him', 'too', 'does', 'by', 'hers', "wasn't", 'who', 'but', 'weren', 'needn', 'our', 'while', '^', 'there', 'has', "you'll", 'it', 'which', 'myself', 'such', 'now', 'she', 'how', 'yours', 'an', 'll', 'yourselves', 'not', 'have', 'those', 'were', "didn't", 'mustn', 'hasn', 'y', 'if', ']', 'for', '}', 'we', 'just', 'all', "weren't", 'o', 'isn', "mustn't", "won't", '$', "hadn't", '>', 'ourselves', "she's", 've', 'didn', 'both', 't', 'you', 'my', 'only', "isn't", '_', 'from', 'through', '"', '~', 'theirs', 's', 'what', 'm', "wouldn't", 'am', "shan't", 'during', 'this', 'where', '/', 'wasn', 'any', 'hadn', "haven't", 'ours', 'other', 'some', 'mightn', '(', 'the', 'i', 'than', 'is', 'he', '@', 'then', 'should', "couldn't", '-', '&', 'nor', 'doesn', "mightn't", "'", 'or', '!', 'having', 'be', 'her', 'when', 'with', 'these', 'in', 'at', "that'll", "aren't", 'being', 'off', "hasn't", 'until', 'shan', "need

In [0]:
positive_fileid_words = []
for i in range(len(positive_fileids)):
    positive_fileid_words.append(list(word.lower() for word in movie_reviews.words(fileids=positive_fileids[i]) if word not in useless_words))

negative_fileid_words = []
for i in range(len(negative_fileids)):
    negative_fileid_words.append(list(word.lower() for word in movie_reviews.words(fileids=negative_fileids[i]) if word not in useless_words))

In [13]:
print(positive_fileid_words[0])

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', '80s', '12', 'part', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', '500', 'pages', 'long', 'includes', 'nearly', '30', 'consist', 'nothing', 'footnotes', 'words', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hell', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'a

In [14]:
print(negative_fileid_words[0])

['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mind', 'fuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scenes', 'tons', 'weird', '

### Creating Bag-of-Words(= Text as unordered collection of words)

In [0]:
def bag_of_words(words):
    return {word:True for word in words}

In [16]:
all_words = movie_reviews.words()
print(len(all_words)/1e6, 'million')

1.58382 million


In [17]:
filtered_words = [word for word in movie_reviews.words() if word not in useless_words]
print('After filtering:',len(filtered_words)/1e6, 'million')

After filtering: 0.710579 million


### Creating features

In [0]:
positive_features = [(bag_of_words(movie_reviews.words(fileids=[f])), 'pos') for f in positive_fileids]

negative_features = [(bag_of_words(movie_reviews.words(fileids=[f])), 'neg') for f in negative_fileids]

In [19]:
print(positive_features[0], negative_features[0], sep='\n\n\n')

({'films': True, 'adapted': True, 'from': True, 'comic': True, 'books': True, 'have': True, 'had': True, 'plenty': True, 'of': True, 'success': True, ',': True, 'whether': True, 'they': True, "'": True, 're': True, 'about': True, 'superheroes': True, '(': True, 'batman': True, 'superman': True, 'spawn': True, ')': True, 'or': True, 'geared': True, 'toward': True, 'kids': True, 'casper': True, 'the': True, 'arthouse': True, 'crowd': True, 'ghost': True, 'world': True, 'but': True, 'there': True, 's': True, 'never': True, 'really': True, 'been': True, 'a': True, 'book': True, 'like': True, 'hell': True, 'before': True, '.': True, 'for': True, 'starters': True, 'it': True, 'was': True, 'created': True, 'by': True, 'alan': True, 'moore': True, 'and': True, 'eddie': True, 'campbell': True, 'who': True, 'brought': True, 'medium': True, 'to': True, 'whole': True, 'new': True, 'level': True, 'in': True, 'mid': True, '80s': True, 'with': True, '12': True, '-': True, 'part': True, 'series': True

In [20]:
print(len(positive_features), len(negative_features))

1000 1000


In [21]:
split = int(0.80*len(positive_features)) # 80% OF positive_features or negative_features
split

800

### Creating NaiveBayesClassifier model

In [0]:
from nltk.classify import NaiveBayesClassifier
sentiment_classifier = NaiveBayesClassifier.train(positive_features[:split]+negative_features[:split])

### Training accuracy

In [23]:
nltk.classify.util.accuracy(sentiment_classifier, positive_features[:split]+negative_features[:split])*100

98.25

### Testing accuracy

In [24]:
nltk.classify.util.accuracy(sentiment_classifier, positive_features[split:]+negative_features[split:])*100

73.5

Our ** training accuracy ** is * 98.25% * and ** test accuracy ** is * 73.5% *

so, using naivebayes model we get ** overfit ** model

### Manually giving input to see output

In [25]:
sentiment_classifier.classify(bag_of_words("amazing film".split()))

'pos'

In [26]:
sentiment_classifier.classify(bag_of_words("good film".split()))

'pos'

In [27]:
sentiment_classifier.classify(bag_of_words("best movie".split()))

'pos'

In [28]:
sentiment_classifier.classify(bag_of_words("good movie".split()))

'neg'

In [29]:
sentiment_classifier.classify(bag_of_words("worst film".split()))

'neg'

In [30]:
sentiment_classifier.classify(bag_of_words("bad film".split()))

'neg'

In [31]:
sentiment_classifier.classify(bag_of_words("felling delightful after seeing this kind of movie".split()))

'pos'

### Using SVM Classifier  model

In [0]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

svc_classifier = SklearnClassifier(SVC(kernel='sigmoid'))

In [34]:
svc_classifier.train(positive_features[:split]+negative_features[:split])

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [35]:
nltk.classify.util.accuracy(svc_classifier, positive_features[:split]+negative_features[:split])*100

78.6875

In [36]:
nltk.classify.util.accuracy(svc_classifier, positive_features[split:]+negative_features[split:])*100

73.25

 Now our svm classifier model doesn't have ** overfit **