# NLTK Movie Reviews - Naive Bayes Classifier

* I'll train a sentiment classifier on the NLTK `movie_reviews` corpus using the Naive Bayes statistical classifier.

In [1]:
# Imports
import nltk
import random
import pickle
from nltk.corpus import movie_reviews

In [2]:
# Getting the documents and their category from corpus:
# one (list_of_words, category) pair per review file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a fixed seed so the train/test split (and therefore the
# reported accuracy) is the same on every Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
random.shuffle(documents)

In [3]:
# Frequency of every lower-cased word in the whole corpus.
# The tokens are streamed straight into the counter.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())
all_words

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [4]:
# Preparing the features to train the model: every distinct word counted in
# all_words is used as a feature name.
# NOTE(review): keys() presumably returns a view tied to all_words rather than
# a copy -- confirm all_words is not modified after this cell.
word_features = all_words.keys()

# Function to get features from a document/review
def find_features(document, vocabulary=None):
    '''
    Build word-presence features for one document/review.

    Parameters
    ----------
    document : iterable of str
        The words of the review.
    vocabulary : iterable of str, optional
        The words to use as feature names. When omitted, the notebook-level
        ``word_features`` is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    dict
        One entry per vocabulary word, mapping the word to True when it
        occurs in ``document`` and False otherwise.
    '''
    if vocabulary is None:
        # Fall back to the notebook-wide feature list.
        vocabulary = word_features

    # A set makes each membership check constant-time.
    words_doc = set(document)
    return {w_f: (w_f in words_doc) for w_f in vocabulary}

In [5]:
# Build one (feature_dict, category) pair per document, in document order
featuresets = []
for review_words, label in documents:
    featuresets.append((find_features(review_words), label))

In [6]:
# Train data: the first 95% of the feature sets
training_set = featuresets[: len(featuresets) * 95 // 100]

# Test data: the remaining 5%, i.e. everything after the training slice
testing_set = featuresets[len(training_set):]

In [7]:
# Creating and training the classifier: fit NLTK's Naive Bayes classifier on
# the training portion of the feature sets.
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [8]:
# Score the trained classifier on the held-out test set and report it as a percentage
print("\nModel Accuracy: ")
accuracy_ratio = nltk.classify.accuracy(classifier, testing_set)
print("Classifier Accuracy:", accuracy_ratio * 100, "%")


Model Accuracy: 
Classifier Accuracy: 84.0 %


In [12]:
# Most informative features: ask the trained classifier to display 15 of them
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     14.6 : 1.0
                  avoids = True              pos : neg    =     12.6 : 1.0
              astounding = True              pos : neg    =     11.4 : 1.0
             outstanding = True              pos : neg    =     11.1 : 1.0
             fascination = True              pos : neg    =     10.7 : 1.0
                    3000 = True              neg : pos    =     10.6 : 1.0
               insulting = True              neg : pos    =     10.5 : 1.0
                   sucks = True              neg : pos    =     10.1 : 1.0
                  hatred = True              pos : neg    =     10.1 : 1.0
                    slip = True              pos : neg    =     10.1 : 1.0
                  hudson = True              neg : pos    =      9.9 : 1.0
                  stinks = True              neg : pos    =      9.9 : 1.0
                  annual = True              pos : neg    =      9.4 : 1.0

In [None]:
# Saving the classifier.
# Disabled by default: set SAVE_CLASSIFIER to True to write naivebayes.pickle.
SAVE_CLASSIFIER = False
if SAVE_CLASSIFIER:
    # "with" closes the file even if pickling raises.
    with open("naivebayes.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)

In [None]:
# Load the classifier.
# Disabled by default: set LOAD_CLASSIFIER to True to replace `classifier`
# with the model stored in naivebayes.pickle.
LOAD_CLASSIFIER = False
if LOAD_CLASSIFIER:
    # SECURITY: pickle.load can execute arbitrary code. Only load a pickle
    # file that you created yourself.
    with open("naivebayes.pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)