In [1]:
# Simple Movie Reviews Classification using Naive Bayes Algorithm.
# Given two choices and they are labelled/tagged as +ve or -ve we can classify data.

import nltk
import random
from nltk.corpus import movie_reviews

# Words as features; make list of tuples
# Features: document with words marked as +ve and -ve
# Words with tag of +ve and -ve used for classification
documents = [(list(movie_reviews.words(fileid)),category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)
#print(documents[1])


# Make list of all words available in the movie reviews
# See if the classified words in documents in all_words; classify that text as +ve or -ve based on number of occurences
all_words = []
for w in movie_reviews.words():                 # Returns all Text as words
    all_words.append(w.lower())             # Normalization; convert text to lowercase


# Returns the frequence of occurance of each word
all_words = nltk.FreqDist(all_words)

#print(all_words.most_common(15))
#print(all_words['stupid'])                  # Returns Number of occurences of a word


word_features = list(all_words.keys())[:3000]   # Take first 3000 word occurences


# In a document, check for occurences of words in word_features[first 3000 words]
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)          # if word present in list of 3000 words, return True else False
    return features
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))              # Find occurence of words in word_features in negative reviews
print('\n')
featuresets = [(find_features(rev),category) for (rev,category) in documents]


# Using Naive Baye's Classifier for Classification
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# posterior = prior occurences x likelihood / evidence
clf = nltk.NaiveBayesClassifier.train(training_set)                             # Using Naive Baye's classifier to Train on Data
print('Naive Bayes Accuracy percent: ',(nltk.classify.accuracy(clf,testing_set))*100)    # Making accuracy Predictions on Testing Data

clf.show_most_informative_features(15)                                          # show first 15 most informative features




Naive Bayes Accuracy percent:  69.0
Most Informative Features
                   dread = True              pos : neg    =      9.7 : 1.0
                 offbeat = True              pos : neg    =      7.4 : 1.0
                supports = True              pos : neg    =      7.0 : 1.0
                  coyote = True              neg : pos    =      7.0 : 1.0
                  denial = True              pos : neg    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
             overwrought = True              neg : pos    =      7.0 : 1.0
                 insipid = True              neg : pos    =      6.6 : 1.0
                headache = True              neg : pos    =      6.2 : 1.0
                  elliot = True              pos : neg    =      6.2 : 1.0
               uplifting = True              pos : neg    =      5.9 : 1.0
                   flock = True              pos : neg    =      5.7 : 1.0
                  guinea = True     