In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

from nltk.tokenize import word_tokenize
from unidecode import unidecode
import codecs
import collections

import sentiment_aware as sa
import random

### Data Preprocessing

In [2]:
# load data
short_pos = codecs.open("short_reviews/positive1.txt", encoding='utf-8').read()
short_neg = codecs.open("short_reviews/negative1.txt", encoding='utf-8').read()

In [3]:
# create labeled documents
documents = []

for r in short_pos.split('\n'):
    documents.append( (r, "pos") )

short_pos_len = len(documents)

for r in short_neg.split('\n'):
    documents.append( (r, "neg") )

short_neg_len = len(documents) - short_pos_len

print "Short Positive Reviews: ", short_pos_len
print "Short Negative Reviews: ", short_neg_len

Short Positive Reviews:  5332
Short Negative Reviews:  5332


### Word Tokenize

In [4]:
# Define Sentiment Tokenizer Class as tok
tok = sa.Tokenizer(preserve_case=False)

In [5]:
# create all words lists
all_words = []

short_pos_words = tok.tokenize(short_pos)
short_neg_words = tok.tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

print "All words number: ", len(all_words)

All words number:  225005


### Get most frequent 5000 words

In [6]:
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

In [7]:
print "All unique words: ", len(all_words)

All unique words:  20679


In [51]:
save_word_features = open("pickled_algos/word_features5k_shortReviews.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

### Find features

In [8]:
def find_features(document):
    words = tok.tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

### Create all feature sets

In [9]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [10]:
# Shuffle all sets
random.shuffle(featuresets)

### Split into Train and Test set

In [11]:
training_set = featuresets[:8000]
testing_set =  featuresets[8000:]

print "Training data: ", len(training_set)
print "Testing data: ", len(testing_set)

Training data:  8000
Testing data:  2664


### Original Naive Bayes

In [12]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [13]:
print("Original Naive Bayes algo accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

('Original Naive Bayes algo accuracy percent:', 65.27777777777779)


In [14]:
classifier.show_most_informative_features(20)

Most Informative Features
                mediocre = True              neg : pos    =     13.5 : 1.0
                   waste = True              neg : pos    =     11.5 : 1.0
                 winning = True              pos : neg    =     11.1 : 1.0
                delicate = True              pos : neg    =     10.5 : 1.0
             mesmerizing = True              pos : neg    =      9.1 : 1.0
              irritating = True              neg : pos    =      8.9 : 1.0
                mindless = True              neg : pos    =      8.9 : 1.0
                plotting = True              neg : pos    =      8.2 : 1.0
             pretentious = True              neg : pos    =      8.0 : 1.0
                  unique = True              pos : neg    =      7.9 : 1.0
                    loud = True              neg : pos    =      7.7 : 1.0
                 assured = True              pos : neg    =      7.1 : 1.0
             moviemaking = True              pos : neg    =      7.1 : 1.0

### Demo of Classifier

In [46]:
example_pos = short_pos.split('\n')[random.randint(0, len(short_pos.split('\n')) - 1)]
example_neg = short_neg.split('\n')[random.randint(0, len(short_neg.split('\n')) - 1)]

print "Positive Review: \n", example_pos
print "Result of Naive Bayes Classifier: \n", classifier.classify(find_features(example_pos))

print "\nNegative Review: \n", example_neg
print "Result of Naive Bayes Classifier: \n", classifier.classify(find_features(example_neg))

Positive Review: 
it may not be as cutting , as witty or as true as back in the glory days of weekend and two or three things i know about her , but who else engaged in filmmaking today is so cognizant of the cultural and moral issues involved in the process ? 
Result of Naive Bayes Classifier: 
pos

Negative Review: 
it should be mentioned that the set design and interiors of the haunted vessel are more than effectively creepy and moodily lit . so i just did . 
Result of Naive Bayes Classifier: 
neg


### Save Classifier

In [48]:
save_classifier = open("pickled_algos/naiveBayes_for_short_reviews.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

### Using save Classifier

In [49]:
saved_classifier_f = open("pickled_algos/naiveBayes_for_short_reviews.pickle", "rb")
saved_classifier = pickle.load(saved_classifier_f)
saved_classifier_f.close()

In [50]:
print("Saved Classifier accuracy percent:",(nltk.classify.accuracy(saved_classifier, testing_set))*100)

('Saved Classifier accuracy percent:', 65.27777777777779)
