# Data Analysis of Movie Review using Natural Language Processing 
> A tutorial of Data Analysis for Movie Review using NLTK.

- toc: true
- badges: true
- comments: true
- categories: [ntlk, jupyter, python, movie-review, natual Language Processing]


In [1]:
import nltk

%config Completer.use_jedi = False

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# nltk.download('movie_reviews')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

In [2]:
from nltk.corpus import movie_reviews
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
len(documents)
# documents
documents[1000][1]

2000

'pos'

In [3]:
import random
random.shuffle(documents)
# documents

In [4]:
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer

In [5]:
# list ... append
# set ... update

In [6]:
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))
# stops
# type(stops)
punctuations = list(string.punctuation)
# punctuations
stops.update(punctuations)


def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def clean_review(words):
    clean_words = []
    for word in words:
        if word.lower() not in stops:
            pos = pos_tag([word])[0][1]
            clean_word = lemmatizer.lemmatize(word, pos=get_simple_pos(pos))
            clean_words.append(clean_word.lower())
    return clean_words

In [7]:
# for doc, cat in documents:
#     print(f'this is documnet: {doc} and this is categories: {cat}')

In [10]:
docs = [(clean_review(document), category) for document, category in documents]
# docs

In [11]:
all_words = []
for tup in docs:
    all_words += tup[0]
len(all_words)   

710579

In [12]:
import nltk
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
# common
features = [i[0] for i in common]
# features

In [13]:
training_documents = docs[0:1500] # %75 out of 2000 row
testing_documents = docs[1500:] # %25 out of 2000 row
# training_documents

In [14]:
def get_feature_dict(words): # words==doc
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set
    return current_features

In [15]:
lst = ['Abbas', 'Sharel']
'Sharel' in lst

True

In [19]:
training_data = [(get_feature_dict(doc), category)
                 for doc, category in training_documents]
# training_data[0]

In [20]:
testing_data = [(get_feature_dict(doc), category)
                for doc, category in testing_documents]
# testing_data

In [21]:
from nltk import NaiveBayesClassifier
clf = NaiveBayesClassifier.train(training_data)

In [22]:
nltk.classify.accuracy(clf, testing_data)

0.782

In [25]:
clf.show_most_informative_features(5) # how manytimes happen in (category (neg/pos)) : against the other category(pos/neg)

Most Informative Features
                 idiotic = True              neg : pos    =      8.9 : 1.0
             outstanding = True              pos : neg    =      8.8 : 1.0
               ludicrous = True              neg : pos    =      8.0 : 1.0
                    zeta = True              neg : pos    =      7.5 : 1.0
                  castle = True              pos : neg    =      7.5 : 1.0


In [26]:
# clf.classify_many([tup[0] for tup in testing_data])