In [4]:
# Import necessary libraries
import nltk
from nltk.corpus import movie_reviews
import random

In [6]:
# Construct list of documents labeled with categories
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

# Rearrage documents in random order
random.shuffle(documents)

In [7]:
# Calculate frequency distribution of each word
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Limit number of features to 2000 most frequent words in the corpus
word_features = list(all_words)[:2000]

# Define feature extractor to check for presence of words
# in any given document
def document_features(document):
    
    # Faster to check set of words than list of words
    document_words = set(document)
    
    features = {}
    
    for word in word_features:
        
        features['contains({})'.format(word)] = (word in document_words)
        
    return features

In [12]:
# Train Naive Bayes classifier

featuresets = [(document_features(d), c) for (d, c) in documents]

train_set, test_set = featuresets[100:], featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [13]:
# Test the accuracy of the classifier

print(nltk.classify.accuracy(classifier, test_set))

0.71


In [15]:
# Show top 10 most important features as interpreted by the classifier
# pos : neg is ratio (i.e. "pos : neg = 5.0 : 1.0"
# means 5 times more likely to be a positive word)

classifier.show_most_informative_features(10)

Most Informative Features
      contains(explores) = True              pos : neg    =     12.4 : 1.0
         contains(moody) = True              pos : neg    =      9.7 : 1.0
 contains(unimaginative) = True              neg : pos    =      7.7 : 1.0
       contains(martian) = True              neg : pos    =      7.7 : 1.0
    contains(schumacher) = True              neg : pos    =      7.0 : 1.0
          contains(mena) = True              neg : pos    =      7.0 : 1.0
        contains(suvari) = True              neg : pos    =      7.0 : 1.0
     contains(atrocious) = True              neg : pos    =      7.0 : 1.0
        contains(neatly) = True              pos : neg    =      6.3 : 1.0
       contains(singers) = True              pos : neg    =      6.3 : 1.0


In [19]:
featuresets[0]

({'contains(plot)': False,
  'contains(:)': True,
  'contains(two)': False,
  'contains(teen)': False,
  'contains(couples)': False,
  'contains(go)': False,
  'contains(to)': True,
  'contains(a)': True,
  'contains(church)': False,
  'contains(party)': False,
  'contains(,)': True,
  'contains(drink)': False,
  'contains(and)': True,
  'contains(then)': True,
  'contains(drive)': False,
  'contains(.)': True,
  'contains(they)': True,
  'contains(get)': False,
  'contains(into)': True,
  'contains(an)': True,
  'contains(accident)': False,
  'contains(one)': True,
  'contains(of)': True,
  'contains(the)': True,
  'contains(guys)': False,
  'contains(dies)': True,
  'contains(but)': True,
  'contains(his)': True,
  'contains(girlfriend)': False,
  'contains(continues)': False,
  'contains(see)': True,
  'contains(him)': True,
  'contains(in)': True,
  'contains(her)': True,
  'contains(life)': False,
  'contains(has)': True,
  'contains(nightmares)': False,
  'contains(what)': True,
