# Document Classification

This notebook provides a simple example of Document Classification. 

In [1]:
import nltk
from nltk.corpus import movie_reviews
import random

In [2]:
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

In [10]:
random.shuffle(documents)

In [12]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document): 
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [13]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
print(nltk.classify.accuracy(classifier, test_set))

0.85


In [15]:
 classifier.show_most_informative_features(5)

Most Informative Features
    contains(schumacher) = True              neg : pos    =      7.4 : 1.0
        contains(shoddy) = True              neg : pos    =      7.0 : 1.0
          contains(mena) = True              neg : pos    =      7.0 : 1.0
 contains(unimaginative) = True              neg : pos    =      7.0 : 1.0
        contains(suvari) = True              neg : pos    =      7.0 : 1.0
