In [11]:
import nltk
import random
from nltk.corpus import movie_reviews
import pickle

In [2]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [3]:
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

In [4]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [5]:
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

In [6]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [7]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

In [8]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [9]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

('Classifier accuracy percent:', 67.0)


In [10]:
classifier.show_most_informative_features(15)

Most Informative Features
               insulting = True              neg : pos    =     10.3 : 1.0
                 wasting = True              neg : pos    =      8.4 : 1.0
            refreshingly = True              pos : neg    =      7.6 : 1.0
                    sans = True              neg : pos    =      7.1 : 1.0
              mediocrity = True              neg : pos    =      7.1 : 1.0
               dismissed = True              pos : neg    =      6.9 : 1.0
                  fabric = True              pos : neg    =      6.3 : 1.0
               uplifting = True              pos : neg    =      6.1 : 1.0
                  stinks = True              neg : pos    =      5.9 : 1.0
             bruckheimer = True              neg : pos    =      5.7 : 1.0
                   wires = True              neg : pos    =      5.7 : 1.0
                 matthau = True              neg : pos    =      5.7 : 1.0
                  doubts = True              pos : neg    =      5.7 : 1.0

### Save the classifier

In [12]:
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

### Use saved classifier

In [14]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [15]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

('Classifier accuracy percent:', 67.0)
