In [108]:
import pandas as pd
import numpy as np
import os
import nltk
from collections import Counter
from nltk.tokenize import wordpunct_tokenize
from nltk import bigrams
from nltk import trigrams
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import NMF
from sklearn.cross_validation import KFold, train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
                    f1_score
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from pymongo import MongoClient
from code.util import Util

In [109]:
# Accumulators filled below and reused by later cells.
review_list, opinion_list = [], []
sentiword_list, sentiment_list = [], []
word_list = []
pos, neg = [], []
vocab = {}

# MongoDB handles: the review collection and the sentiment-word collection.
client = MongoClient()

db = client['reviews']
collection = db['movies']
reviews = collection.find()

db1 = client['sentiment']
collection1 = db1['bingliu']
sentiments = collection1.find()

# One pass over the review cursor: keep label and text in parallel lists.
for review in reviews:
    opinion_list.append(review['Opinion'])
    review_list.append(review['Review'])

# One pass over the lexicon cursor: record every word with its polarity and
# additionally bucket words whose polarity is exactly 1 (pos) or -1 (neg).
for sentiment in sentiments:
    word = sentiment['Word']
    sentiword_list.append(word)
    polarity = sentiment['Sentiment']
    sentiment_list.append(polarity)
    if polarity == 1:
        pos.append(word)
    elif polarity == -1:
        neg.append(word)

In [111]:
# Signed TF-IDF features: one column per lexicon word, with the weight of
# every word listed in `neg` negated. Three classifiers are then scored
# through Util.CalculateAccuracy on a random train/test split.
vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode',
                             vocabulary=sentiword_list, lowercase=True)
review_tf = vectorizer.fit_transform(review_list)

# Per-column sign: -1 for negative-lexicon words, +1 for everything else.
# Column i is assumed to correspond to sentiword_list[i] (the list passed as
# `vocabulary`), which is what the previous vectorizer.vocabulary[idx] lookup
# relied on as well.
neg_words = set(neg)  # set membership instead of scanning a list per entry
col_sign = np.array([-1 if term in neg_words else 1
                     for term in sentiword_list], dtype=int)

# Flip the stored non-zeros in one vectorized step instead of assigning into
# the sparse matrix element by element. Relies on the CSR layout (`indices`
# holds column ids), as the earlier per-row `.indices` access did.
review_sf = review_tf.copy()
review_sf.data *= col_sign[review_sf.indices]

# Per-review net score; not used by this cell, which trains on the full
# signed matrix.
X = review_sf.sum(axis=1)

# Defined here so the cell runs on a fresh kernel; previously it was only
# created by a later cell, giving a NameError under Restart & Run All.
opinion_array = np.array(opinion_list)

# NOTE(review): the split is unseeded, so scores vary between runs.
feats_train, feats_test, opinions_train, opinions_test = train_test_split(
    review_sf, opinion_array)
kf = KFold(feats_train.shape[0], n_folds=5)

utility = Util()
for label, estimator in [('Bernoulli NB', BernoulliNB()),
                         ('Logistic Regression', LogisticRegression()),
                         ('SVC', SVC())]:
    train_accuracy, test_accuracy = utility.CalculateAccuracy(
        estimator, 5, feats_train, opinions_train, feats_test, opinions_test)
    print(label)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

Bernoulli NB
Train Accuracy : 0.712666666667
Test Accuracy : 0.698
Logistic Regression
Train Accuracy : 0.835333333333
Test Accuracy : 0.84
SVC
Train Accuracy : 0.475333333333
Test Accuracy : 0.498


In [112]:
# Raw lexicon-word counts as features: one column per word in sentiword_list.
# Three classifiers are scored through Util.CalculateAccuracy on a random
# train/test split.
vectorizer = CountVectorizer(decode_error='replace', strip_accents='unicode',
                             vocabulary=sentiword_list)
review_tf = vectorizer.fit_transform(review_list)

# Defined here so the cell runs on a fresh kernel; previously it was only
# created by a later cell, giving a NameError under Restart & Run All.
opinion_array = np.array(opinion_list)

# NOTE(review): the split is unseeded, so scores vary between runs.
feats_train, feats_test, opinions_train, opinions_test = train_test_split(
    review_tf, opinion_array)
kf = KFold(feats_train.shape[0], n_folds=5)

utility = Util()
for label, estimator in [('Bernoulli NB', BernoulliNB()),
                         ('Logistic Regression', LogisticRegression()),
                         ('SVC', SVC())]:
    train_accuracy, test_accuracy = utility.CalculateAccuracy(
        estimator, 5, feats_train, opinions_train, feats_test, opinions_test)
    print(label)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

Bernoulli NB
Train Accuracy : 0.81
Test Accuracy : 0.806
Logistic Regression
Train Accuracy : 0.828
Test Accuracy : 0.8028
SVC
Train Accuracy : 0.554
Test Accuracy : 0.578


In [113]:
# Single-feature model: each review is reduced to the sum of its lexicon-word
# counts, with counts of words listed in `neg` negated. Three classifiers are
# scored on that one column through Util.CalculateAccuracy.
vectorizer = CountVectorizer(decode_error='replace', strip_accents='unicode',
                             vocabulary=sentiword_list)
review_tf = vectorizer.fit_transform(review_list)

# Per-column sign: -1 for negative-lexicon words, +1 for everything else.
# Column i is assumed to correspond to sentiword_list[i] (the list passed as
# `vocabulary`), which is what the previous vectorizer.vocabulary[idx] lookup
# relied on as well.
neg_words = set(neg)  # set membership instead of scanning a list per entry
col_sign = np.array([-1 if term in neg_words else 1
                     for term in sentiword_list], dtype=int)

# Flip the stored non-zeros in one vectorized step instead of assigning into
# the sparse matrix element by element. Relies on the CSR layout (`indices`
# holds column ids), as the earlier per-row `.indices` access did.
review_sf = review_tf.copy()
review_sf.data *= col_sign[review_sf.indices]

# Net signed count per review (one value per row).
X = review_sf.sum(axis=1)

opinion_array = np.array(opinion_list)

# NOTE(review): the split is unseeded, so scores vary between runs.
feats_train, feats_test, opinions_train, opinions_test = train_test_split(
    X, opinion_array)
kf = KFold(feats_train.shape[0], n_folds=5)

utility = Util()
for label, estimator in [('Bernoulli NB', BernoulliNB()),
                         ('Logistic Regression', LogisticRegression()),
                         ('SVC', SVC())]:
    train_accuracy, test_accuracy = utility.CalculateAccuracy(
        estimator, 5, feats_train, opinions_train, feats_test, opinions_test)
    print(label)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

Bernoulli NB
Train Accuracy : 0.699333333333
Test Accuracy : 0.7
Logistic Regression
Train Accuracy : 0.698666666667
Test Accuracy : 0.702
SVC
Train Accuracy : 0.694666666667
Test Accuracy : 0.696


In [94]:
# Bagging ensembles scored on whatever train/test split is currently held in
# feats_train / feats_test / opinions_train / opinions_test.
utility = Util()

# (printed label, estimator) pairs, evaluated and reported in this order.
bagging_models = [
    ('Bagging (Trees)', BaggingClassifier()),
    ('Bagging (Logit)',
     BaggingClassifier(base_estimator=LogisticRegression())),
    ('Bagging (SVC)', BaggingClassifier(base_estimator=SVC())),
    ('Bagging (NB)', BaggingClassifier(base_estimator=BernoulliNB())),
]

for label, model in bagging_models:
    train_accuracy, test_accuracy = utility.CalculateAccuracy(
        model, 5, feats_train, opinions_train, feats_test, opinions_test)
    print(label)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

Bagging (Trees)
Train Accuracy : 0.693333333333
Test Accuracy : 0.7156
Bagging (Logit)
Train Accuracy : 0.689333333333
Test Accuracy : 0.73
Bagging (SVC)
Train Accuracy : 0.681333333333
Test Accuracy : 0.71
Bagging (NB)
Train Accuracy : 0.691333333333
Test Accuracy : 0.724


In [95]:
# AdaBoost ensembles scored on the current train/test split; reuses the
# `utility` object created in an earlier cell.
# (printed label, estimator) pairs, evaluated and reported in this order.
boosting_models = [
    ('AdaBoost (Trees)', AdaBoostClassifier()),
    ('AdaBoost (SVC)',
     AdaBoostClassifier(base_estimator=SVC(), algorithm='SAMME')),
    ('AdaBoost (NB)', AdaBoostClassifier(base_estimator=BernoulliNB())),
]

for label, model in boosting_models:
    train_accuracy, test_accuracy = utility.CalculateAccuracy(
        model, 5, feats_train, opinions_train, feats_test, opinions_test)
    print(label)
    print('Train Accuracy : {}'.format(train_accuracy))
    print('Test Accuracy : {}'.format(test_accuracy))

AdaBoost (Trees)
Train Accuracy : 0.7
Test Accuracy : 0.724
AdaBoost (SVC)
Train Accuracy : 0.474666666667
Test Accuracy : 0.4984
AdaBoost (NB)
Train Accuracy : 0.691333333333
Test Accuracy : 0.724


In [97]:
# NLTK Naive Bayes on token-count features built from the raw review text.
# NOTE(review): the split is unseeded, so the accuracy varies between runs.
feats_train, feats_test, opinions_train, opinions_test = train_test_split(
    review_list, opinion_array)

# Rebuild the training set on every run. Appending to the notebook-level
# word_list made the cell non-idempotent: re-running it stacked rows from
# earlier random splits, which could include reviews now in the test split.
word_list = [(Counter(wordpunct_tokenize(review)), opinion)
             for review, opinion in zip(feats_train, opinions_train)]

classifier = nltk.NaiveBayesClassifier.train(word_list)

# Held-out (token counts, label) pairs, built the same way as the training set.
word_list1 = [(Counter(wordpunct_tokenize(review)), opinion)
              for review, opinion in zip(feats_test, opinions_test)]

print(nltk.classify.accuracy(classifier, word_list1))

0.73
