In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
import sys
import os
import time

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

from nltk.tokenize import word_tokenize
import codecs
import collections

import random


sys.path.append("/Users/Bya/git/predictEPL/utils/")
sys.path.append("/Users/Bya/git/predictEPL/config/")
sys.path.append("/Users/Bya/git/predictEPL/WebScrapping/")

import paths
import useful_methods

### Data Preprocessing

In [2]:
# load data
short_pos = codecs.open(paths.READ_PATH_REVIEW_SHORT + "positive1.txt", encoding='utf-8').read()
short_neg = codecs.open(paths.READ_PATH_REVIEW_SHORT + "negative1.txt", encoding='utf-8').read()

In [3]:
# create labeled documents
documents = []

for r in short_pos.split('\n'):
    documents.append( (r, "pos") )

short_pos_len = len(documents)

for r in short_neg.split('\n'):
    documents.append( (r, "neg") )

short_neg_len = len(documents) - short_pos_len

print("Short Positive Reviews: ", short_pos_len)
print ("Short Negative Reviews: ", short_neg_len)

Short Positive Reviews:  5332
Short Negative Reviews:  5332


### Word Tokenize

In [4]:
# create all words lists
all_words = []

short_pos_words = useful_methods.tokenizer(short_pos)
short_neg_words = useful_methods.tokenizer(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

print("All words number: ", len(all_words))

All words number:  115953


### Get most frequent 5000 words

In [5]:
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[0:5000]

In [6]:
print("All unique words: ", len(all_words))

All unique words:  14456


In [7]:
save_word_features = open(paths.READ_PATH_REVIEW_SHORT + "word_features5k_shortReviews.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

### Find features

In [8]:
def find_features(document):
    words = useful_methods.tokenizer(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

### Create all feature sets

In [9]:
start_time = time.time()

featuresets = [(find_features(rev), category) for (rev, category) in documents]

print("%.2f" %(time.time() -start_time))

28.68


In [10]:
# Shuffle all sets
random.shuffle(featuresets)

### Split into Train and Test set

In [11]:
training_set = featuresets[:8000]
testing_set =  featuresets[8000:]

print("Training data: ", len(training_set))
print("Testing data: ", len(testing_set))

Training data:  8000
Testing data:  2664


### Original Naive Bayes

In [12]:
start_time = time.time()

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("%.2f" %(time.time() -start_time))

55.84


In [13]:
start_time = time.time()

print("Original Naive Bayes algo accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

print("%.2f" %(time.time() -start_time))

Original Naive Bayes algo accuracy percent: 68.20570570570571
82.60


In [14]:
classifier.show_most_informative_features(20)

Most Informative Features
                   intim = True              pos : neg    =     14.6 : 1.0
                    bore = True              neg : pos    =     13.9 : 1.0
                 refresh = True              pos : neg    =     13.9 : 1.0
                  tender = True              pos : neg    =     11.8 : 1.0
                    flat = True              neg : pos    =     11.2 : 1.0
                   dazzl = True              pos : neg    =     11.2 : 1.0
                     son = True              pos : neg    =     11.2 : 1.0
                  stupid = True              neg : pos    =     10.8 : 1.0
                  clumsi = True              neg : pos    =      8.2 : 1.0
                     gem = True              pos : neg    =      7.8 : 1.0
                polanski = True              pos : neg    =      7.8 : 1.0
                 shallow = True              neg : pos    =      7.5 : 1.0
                    sink = True              neg : pos    =      7.5 : 1.0

### Demo of Classifier

In [15]:
example_pos = short_pos.split('\n')[random.randint(0, len(short_pos.split('\n')) - 1)]
example_neg = short_neg.split('\n')[random.randint(0, len(short_neg.split('\n')) - 1)]

print("Positive Review: \n", example_pos)
print("Result of Naive Bayes Classifier: \n", classifier.classify(find_features(example_pos)))

print("\nNegative Review: \n", example_neg)
print("Result of Naive Bayes Classifier: \n", classifier.classify(find_features(example_neg)))

Positive Review: 
 there are some wonderfully fresh moments that smooth the moral stiffness with human kindness and hopefulness . 
Result of Naive Bayes Classifier: 
 pos

Negative Review: 
 a broad , melodramatic estrogen opera that's pretty toxic in its own right . 
Result of Naive Bayes Classifier: 
 neg


### Save Classifier

In [16]:
save_classifier = open(paths.READ_PATH_REVIEW_SHORT + "naiveBayes_for_short_reviews_5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

### Using save Classifier

In [17]:
saved_classifier_f = open(paths.READ_PATH_REVIEW_SHORT + "naiveBayes_for_short_reviews_5k.pickle", "rb")
saved_classifier = pickle.load(saved_classifier_f)
saved_classifier_f.close()

In [18]:
start_time = time.time()

print("Positive Review: \n", example_pos)
print("Result of Naive Bayes Classifier: \n", classifier.classify(find_features(example_pos)))

print("\n[Passed Time]:%.2f" %(time.time() -start_time))

Positive Review: 
 there are some wonderfully fresh moments that smooth the moral stiffness with human kindness and hopefulness . 
Result of Naive Bayes Classifier: 
 pos

[Passed Time]:0.04
