# Data Analysis of Movie Review using Natural Language Processing 
> A tutorial of Data Analysis for Movie Review using NLTK.

- toc: true
- badges: true
- comments: true
- categories: [ntlk, jupyter, python, movie-review, natual Language Processing]


In [1]:
import nltk

%config Completer.use_jedi = False

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# nltk.download('movie_reviews')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

In [2]:
from nltk.corpus import movie_reviews
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
# len(documents)
# documents
# documents[1110][1]

In [3]:
import random
random.shuffle(documents)
# documents

In [4]:
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer

In [5]:
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))
# stops
# type(stops)
punctuations = list(string.punctuation)
# punctuations
stops.update(punctuations)


def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def clean_review(words):
    clean_words = []
    for word in words:
        if word.lower() not in stops:
            pos = pos_tag([word])[0][1]
            clean_word = lemmatizer.lemmatize(word, pos=get_simple_pos(pos))
            clean_words.append(clean_word.lower())
    return clean_words

In [18]:
docs = [(clean_review(document), category) for document, category in documents]
# docs

In [19]:
all_words = []
for tup in docs:
    all_words += tup[0]
# all_words   

In [29]:
import nltk
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
# common
features = [i[0] for i in common]
# features

In [30]:
training_documents = docs[0:1500]
testing_documents = docs[1500:]
# training_documents

In [31]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [40]:
training_data = [(get_feature_dict(doc), category)
                 for doc, category in training_documents]
# training_data[0]

In [33]:
testing_data = [(get_feature_dict(doc), category)
                for doc, category in testing_documents]
# testing_data

In [34]:
from nltk import NaiveBayesClassifier
clf = NaiveBayesClassifier.train(training_data)

In [35]:
nltk.classify.accuracy(clf, testing_data)

0.82

In [43]:
clf.show_most_informative_features(50) # how manytimes happen in (category (neg/pos)) : against the other category(pos/neg)

Most Informative Features
               ludicrous = True              neg : pos    =     12.6 : 1.0
                  seagal = True              neg : pos    =     11.0 : 1.0
               stupidity = True              neg : pos    =     10.2 : 1.0
             outstanding = True              pos : neg    =      8.5 : 1.0
             beautifully = True              pos : neg    =      8.1 : 1.0
                    lame = True              neg : pos    =      7.8 : 1.0
            breathtaking = True              pos : neg    =      7.3 : 1.0
                   mulan = True              pos : neg    =      7.0 : 1.0
                   awful = True              neg : pos    =      6.9 : 1.0
                 garbage = True              neg : pos    =      6.3 : 1.0
                   anger = True              pos : neg    =      6.1 : 1.0
             wonderfully = True              pos : neg    =      6.1 : 1.0
                  mature = True              pos : neg    =      5.9 : 1.0

In [37]:
clf.classify_many([tup[0] for tup in testing_data])

['pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
