In [4]:
import pandas as pd
import numpy as np
import os
import nltk
from collections import Counter
from nltk.tokenize import wordpunct_tokenize
from nltk import bigrams
from nltk import trigrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.cross_validation import KFold, train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from pymongo import MongoClient
from code.util import Util

In [15]:
# Load the labelled reviews from MongoDB, turn the text into TF-IDF features,
# hold out a test split, and score four bagging ensembles on it.

# Configuration
n_features = 100    # cap on the TF-IDF vocabulary size
n_folds = 5         # fold count given to KFold and to Util.CalculateAccuracy
random_seed = 42    # fixed so that the split below is identical on every run

# scikit-learn estimators built without an explicit random_state draw from
# NumPy's global RNG, so seed it to make the bagging scores repeatable too.
np.random.seed(random_seed)

#Lists for X and y
review_list = []
opinion_list = []
# Created here but only filled by the NLTK Naive Bayes cell further down.
word_list = []
word_list1 = []

#PyMongo variables
client = MongoClient()
db = client['reviews']
collection = db['movies']
reviews = collection.find()

# Drain the cursor into plain lists, then release the connection even if a
# document is missing one of the two expected fields.
try:
    for review in reviews:
        opinion_list.append(review['Opinion'])
        review_list.append(review['Review'])
finally:
    client.close()

# Fail with a clear message instead of the vectorizer's "empty vocabulary".
if not review_list:
    raise ValueError("No documents found in reviews.movies; nothing to train on")

# Word 1- to 3-grams with English stop words removed, log-scaled term
# frequency and L2-normalised rows.
vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english',
                             ngram_range=(1, 3), sublinear_tf=True, norm='l2')
review_tfidf = vectorizer.fit_transform(review_list)
opinion_array = np.array(opinion_list)

feats_train, feats_test, opinions_train, opinions_test = train_test_split(
    review_tfidf, opinion_array, random_state=random_seed)
# NOTE(review): kf is not read by any cell visible here; kept in case another
# cell depends on it.
kf = KFold(feats_train.shape[0], n_folds=n_folds)

# Util is project-local; CalculateAccuracy presumably cross-validates on the
# training split and scores the held-out split -- confirm against code/util.
utility = Util()
bagging_models = [
    ('Bagging (Trees)', BaggingClassifier()),
    ('Bagging (Logit)', BaggingClassifier(base_estimator=LogisticRegression())),
    ('Bagging (SVC)', BaggingClassifier(base_estimator=SVC())),
    ('Bagging (NB)', BaggingClassifier(base_estimator=BernoulliNB())),
]
for model_name, model in bagging_models:
    train_accuracy, test_accuracy = utility.CalculateAccuracy(
        model, n_folds, feats_train, opinions_train, feats_test, opinions_test)
    print(model_name)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

Bagging (Trees)
Train Accuracy : 0.626666666667
Test Accuracy : 0.6124
Bagging (Logit)
Train Accuracy : 0.709333333333
Test Accuracy : 0.6944
Bagging (SVC)
Train Accuracy : 0.51
Test Accuracy : 0.47
Bagging (NB)
Train Accuracy : 0.667333333333
Test Accuracy : 0.6444


In [13]:
# Score three AdaBoost ensembles on the TF-IDF split: the default base
# estimator (reported as "Trees"), an SVC and a Bernoulli Naive Bayes.
# The SVC variant uses the discrete SAMME algorithm because SVC() exposes no
# predict_proba unless it is built with probability=True.
adaboost_models = (
    ('AdaBoost (Trees)', AdaBoostClassifier()),
    ('AdaBoost (SVC)', AdaBoostClassifier(base_estimator=SVC(), algorithm='SAMME')),
    ('AdaBoost (NB)', AdaBoostClassifier(base_estimator=BernoulliNB())),
)

for model_name, model in adaboost_models:
    # The literal 5 is passed to Util.CalculateAccuracy unchanged; it is
    # presumably the number of cross-validation folds -- confirm in code/util.
    train_accuracy, test_accuracy = utility.CalculateAccuracy(
        model, 5, feats_train, opinions_train, feats_test, opinions_test)
    print(model_name)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

AdaBoost (Trees)
Train Accuracy : 0.66
Test Accuracy : 0.6692
AdaBoost (SVC)
Train Accuracy : 0.458666666667
Test Accuracy : 0.5008
AdaBoost (NB)
Train Accuracy : 0.533333333333
Test Accuracy : 0.5084


In [7]:
# Baselines: the same three learners used inside the ensembles, scored on
# their own against the identical train/test split.
utility = Util()

baseline_models = (
    ('Bernoulli NB', BernoulliNB()),
    ('Logistic Regression', LogisticRegression()),
    ('SVC', SVC()),
)

for model_name, model in baseline_models:
    # The literal 5 is forwarded to Util.CalculateAccuracy as-is; presumably
    # the fold count -- confirm in code/util.
    scores = utility.CalculateAccuracy(
        model, 5, feats_train, opinions_train, feats_test, opinions_test)
    train_accuracy, test_accuracy = scores
    print(model_name)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

Bernoulli NB
Train Accuracy : 0.663333333333
Test Accuracy : 0.6544
Logistic Regression
Train Accuracy : 0.699333333333
Test Accuracy : 0.6976
SVC
Train Accuracy : 0.466
Test Accuracy : 0.4988


In [None]:
# NLTK Naive Bayes on raw token counts, as a comparison with the TF-IDF models.
#
# The raw-text split gets its own names so that feats_train / feats_test /
# opinions_train / opinions_test keep pointing at the TF-IDF split; rebinding
# them to lists of strings would break every scikit-learn cell that is re-run
# after this one.
reviews_train, reviews_test, labels_train, labels_test = train_test_split(
    review_list, opinion_array)

# Build both example lists from scratch here rather than appending to lists
# created in another cell, so re-running this cell does not duplicate examples.
# Each example is (token -> count mapping, label).
word_list = [(Counter(wordpunct_tokenize(text)), label)
             for text, label in zip(reviews_train, labels_train)]

classifier = nltk.NaiveBayesClassifier.train(word_list)

word_list1 = [(Counter(wordpunct_tokenize(text)), label)
              for text, label in zip(reviews_test, labels_test)]

print(nltk.classify.accuracy(classifier, word_list1))