# Data Analysis of Movie Review using Natural Language Processing 
> A tutorial of Data Analysis for Movie Review using NLTK.

- toc: true
- badges: true
- comments: true
- categories: [ntlk, jupyter, python, movie-review, natual Language Processing]


In [18]:
import nltk

nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
from nltk.corpus import movie_reviews
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

In [0]:
import random
random.shuffle(documents)


In [0]:
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
stops=set(stopwords.words('english'))
punctuations=list(string.punctuation)
stops.update(punctuations)
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
def clean_review(words):
    clean_words=[]
    for word in words:
        if word.lower() not in stops:
            pos=pos_tag([word])[0][1]
            clean_word=lemmatizer.lemmatize(word, pos=get_simple_pos(pos))
            clean_words.append(clean_word.lower())
    return clean_words

In [0]:
docs=[(clean_review(document), category) for document, category in documents]

In [0]:
all_words=[]
for tup in docs:
    all_words+=tup[0]

In [22]:
import nltk
freq=nltk.FreqDist(all_words)
common=freq.most_common(3000)
features=[i[0] for i in common]
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'come',
 'also',
 'bad',
 'give',
 'life',
 'two',
 'look',
 'way',
 'know',
 'seem',
 'first',
 'end',
 '--',
 'year',
 'work',
 'thing',
 'plot',
 'say',
 'play',
 'really',
 'little',
 'show',
 'people',
 'could',
 'man',
 'star',
 'love',
 'never',
 'try',
 'great',
 'director',
 'best',
 'performance',
 'new',
 'big',
 'many',
 'action',
 'actor',
 'want',
 'u',
 'watch',
 'find',
 'think',
 'role',
 'act',
 'another',
 'back',
 'audience',
 'something',
 'world',
 'turn',
 'still',
 'day',
 'old',
 'set',
 'however',
 'use',
 'every',
 'begin',
 'though',
 'guy',
 'part',
 'comedy',
 'feel',
 'cast',
 'real',
 'enough',
 'around',
 'point',
 'interest',
 'last',
 'run',
 'write',
 'young',
 'may',
 'fact',
 'name',
 'long',
 'funny',
 'script',
 'actually',
 'right',
 'minute',
 'woman',
 'effect',
 'almost',
 'lot'

In [0]:
training_documents=docs[0:1500]
testing_documents=docs[1500:]


In [0]:
def get_feature_dict(words):
    current_features={}
    words_set=set(words)
    for w in features:
        current_features[w]=w in words_set
    return current_features

In [0]:
training_data=[(get_feature_dict(doc), category)for doc, category in training_documents]

In [0]:
testing_data=[(get_feature_dict(doc), category)for doc, category in testing_documents]

In [0]:
from nltk import NaiveBayesClassifier
clf=NaiveBayesClassifier.train(training_data)

In [29]:
nltk.classify.accuracy(clf, testing_data)

0.834

In [30]:
clf.show_most_informative_features(500)

Most Informative Features
             outstanding = True              pos : neg    =     10.3 : 1.0
              uninspired = True              neg : pos    =      9.9 : 1.0
                  seagal = True              neg : pos    =      8.7 : 1.0
               ludicrous = True              neg : pos    =      8.2 : 1.0
               stupidity = True              neg : pos    =      7.9 : 1.0
            breathtaking = True              pos : neg    =      6.9 : 1.0
                 idiotic = True              neg : pos    =      6.7 : 1.0
                  castle = True              pos : neg    =      6.7 : 1.0
                  bottle = True              neg : pos    =      6.5 : 1.0
                   damon = True              pos : neg    =      6.4 : 1.0
                   mulan = True              pos : neg    =      6.0 : 1.0
                   poker = True              pos : neg    =      6.0 : 1.0
                  turkey = True              neg : pos    =      5.9 : 1.0

In [31]:
clf.classify_many([tup[0] for tup in testing_data])

['pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
