In [52]:
import pandas as pd
import numpy as np
import os
import nltk
from operator import itemgetter
from collections import Counter
from nltk.tokenize import wordpunct_tokenize
from nltk import bigrams
from nltk import trigrams
from mlxtend.classifier import EnsembleClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import NMF
from sklearn.cross_validation import KFold, train_test_split,\
                    cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
                    f1_score
from sklearn.grid_search import RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier,\
                            RandomForestClassifier
from pymongo import MongoClient
from code.util import Util

In [51]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

#Lists for X and y
review_list, opinion_list, sentiword_list, sentiment_list, word_list,\
            pos, neg = [],[],[],[],[],[],[]
vocab = {}
    
#PyMongo variables
client = MongoClient()
db = client['reviews']
collection = db['movies']
reviews = collection.find()

db1 = client['sentiment']
collection1 = db1['bingliu']
sentiments = collection1.find()

#build review and label lists
for review in reviews:
    opinion_list.append(review['Opinion'])
    review_list.append(review['Review'])

opinion_array = np.array(opinion_list)    

#build sentiment word, sentiment polarity, pos word, neg word
for sentiment in sentiments:
    sentiword_list.append(sentiment['Word'])
    sentiment_list.append(sentiment['Sentiment'])
    if sentiment['Sentiment'] == 1:
        pos.append(sentiment['Word'])
    elif sentiment['Sentiment'] == -1:
        neg.append(sentiment['Word'])

In [53]:
vectorizer = TfidfVectorizer(decode_error='replace',strip_accents='unicode',\
                            vocabulary = sentiword_list, lowercase=True)
review_tfidf = vectorizer.fit_transform(review_list)
review_sf = review_tfidf.copy()

#for every review
for i, review_s in enumerate(review_tfidf):
    #for every index (word) in the review
    for idx in review_tfidf[i].indices:
        if vectorizer.vocabulary[idx] in neg:
            review_sf[i, idx] = review_tfidf[i, idx]*-1
        elif vectorizer.vocabulary[idx] in pos:
            review_sf[i, idx] = review_tfidf[i, idx]

In [62]:
clf = LogisticRegression()

param_dist = {"penalty": ['l1', 'l2'],
              "class_weight": ["auto",None],
              "C": [1,.1,.01],
              "fit_intercept": [True,False]}

# run randomized search
n_iter_search = 24
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='accuracy')
random_search.fit(review_sf, opinion_array)
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.837 (std: 0.010)
Parameters: {'penalty': 'l2', 'C': 1, 'fit_intercept': True, 'class_weight': 'auto'}

Model with rank: 2
Mean validation score: 0.837 (std: 0.010)
Parameters: {'penalty': 'l2', 'C': 1, 'fit_intercept': True, 'class_weight': None}

Model with rank: 3
Mean validation score: 0.837 (std: 0.011)
Parameters: {'penalty': 'l2', 'C': 1, 'fit_intercept': False, 'class_weight': 'auto'}



In [63]:
clf = RandomForestClassifier()

param_dist = {"n_estimators": [10, 100],
              "criterion": ["gini","entropy"],
              "max_features": ["auto","sqrt","log2"],
              "max_depth": [None,5,10]}

# run randomized search
n_iter_search = 36
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='accuracy')
random_search.fit(review_sf, opinion_array)
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.818 (std: 0.012)
Parameters: {'max_features': 'auto', 'n_estimators': 100, 'criterion': 'entropy', 'max_depth': None}

Model with rank: 2
Mean validation score: 0.814 (std: 0.003)
Parameters: {'max_features': 'sqrt', 'n_estimators': 100, 'criterion': 'entropy', 'max_depth': None}

Model with rank: 3
Mean validation score: 0.810 (std: 0.009)
Parameters: {'max_features': 'sqrt', 'n_estimators': 100, 'criterion': 'gini', 'max_depth': None}



In [64]:
# Utility function to report best scores
clf = BernoulliNB()

param_dist = {"alpha": [0,.1,1],
              "fit_prior": [True,False]}

# run randomized search
n_iter_search = 6
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='accuracy')
random_search.fit(review_sf, opinion_array)
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.724 (std: 0.025)
Parameters: {'alpha': 1, 'fit_prior': True}

Model with rank: 2
Mean validation score: 0.724 (std: 0.025)
Parameters: {'alpha': 1, 'fit_prior': False}

Model with rank: 3
Mean validation score: 0.712 (std: 0.024)
Parameters: {'alpha': 0.1, 'fit_prior': True}



In [66]:
clf1 = LogisticRegression('l2')
clf2 = RandomForestClassifier(n_estimators=100,criterion='entropy')
clf3 = BernoulliNB(alpha=1,fit_prior=True)
eclf = EnsembleClassifier(clfs=[clf1, clf2, clf3], weights=[1,1,1])
for clf, label in zip([eclf], ['Ensemble']):
    scores = cross_val_score(clf, review_sf, opinion_array,\
                cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.84 (+/- 0.01) [Ensemble]


In [106]:
n_topics = 1
n_top_words = 1000

neg_vectorizer = TfidfVectorizer(decode_error='replace',strip_accents='unicode',\
                            vocabulary = neg, lowercase=True)
neg_tfidf = neg_vectorizer.fit_transform(review_list[0:1000])
pos_vectorizer = TfidfVectorizer(decode_error='replace',strip_accents='unicode',\
                            vocabulary = pos, lowercase=True)
pos_tfidf = pos_vectorizer.fit_transform(review_list[1000:2000])

neg_nmf = NMF(n_components=n_topics, random_state=1).fit(neg_tfidf)
pos_nmf = NMF(n_components=n_topics, random_state=1).fit(pos_tfidf)

neg_feature_names = neg_vectorizer.get_feature_names()
pos_feature_names = pos_vectorizer.get_feature_names()

for topic_idx, topic in enumerate(pos_nmf.components_):
    pos_nmf = [pos_feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]

for topic_idx, topic in enumerate(neg_nmf.components_):
    neg_nmf = [neg_feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]


In [107]:
nmfvectorizer = TfidfVectorizer(decode_error='replace',strip_accents='unicode',\
                    vocabulary = pos_nmf+neg_nmf, lowercase=True)
review_tfidf = nmfvectorizer.fit_transform(review_list)
review_sf = review_tfidf.copy()

#for every review
for i, review_s in enumerate(review_tfidf):
    #for every index (word) in the review
    for idx in review_tfidf[i].indices:
        if nmfvectorizer.vocabulary[idx] in neg_nmf:
            review_sf[i, idx] = review_tfidf[i, idx]*-1
        elif nmfvectorizer.vocabulary[idx] in pos:
            review_sf[i, idx] = review_tfidf[i, idx]

clf = LogisticRegression()

param_dist = {"penalty": ['l1', 'l2'],
              "class_weight": ["auto",None],
              "C": [1,.1,.01],
              "fit_intercept": [True,False]}

# run randomized search
n_iter_search = 24
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='accuracy')
random_search.fit(review_sf, opinion_array)
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.831 (std: 0.009)
Parameters: {'penalty': 'l2', 'C': 1, 'fit_intercept': True, 'class_weight': 'auto'}

Model with rank: 2
Mean validation score: 0.831 (std: 0.009)
Parameters: {'penalty': 'l2', 'C': 1, 'fit_intercept': True, 'class_weight': None}

Model with rank: 3
Mean validation score: 0.829 (std: 0.012)
Parameters: {'penalty': 'l2', 'C': 1, 'fit_intercept': False, 'class_weight': 'auto'}



In [108]:
clf = RandomForestClassifier()

param_dist = {"n_estimators": [10, 100],
              "criterion": ["gini","entropy"],
              "max_features": ["auto","sqrt","log2"],
              "max_depth": [None,5,10]}

# run randomized search
n_iter_search = 36
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='accuracy')
random_search.fit(review_sf, opinion_array)
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.821 (std: 0.017)
Parameters: {'max_features': 'log2', 'n_estimators': 100, 'criterion': 'entropy', 'max_depth': None}

Model with rank: 2
Mean validation score: 0.820 (std: 0.020)
Parameters: {'max_features': 'log2', 'n_estimators': 100, 'criterion': 'gini', 'max_depth': None}

Model with rank: 3
Mean validation score: 0.813 (std: 0.013)
Parameters: {'max_features': 'auto', 'n_estimators': 100, 'criterion': 'gini', 'max_depth': None}



In [109]:
# Utility function to report best scores
clf = BernoulliNB()

param_dist = {"alpha": [0,.1,1],
              "fit_prior": [True,False]}

# run randomized search
n_iter_search = 6
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,scoring='accuracy')
random_search.fit(review_sf, opinion_array)
report(random_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.722 (std: 0.020)
Parameters: {'alpha': 1, 'fit_prior': True}

Model with rank: 2
Mean validation score: 0.722 (std: 0.020)
Parameters: {'alpha': 1, 'fit_prior': False}

Model with rank: 3
Mean validation score: 0.712 (std: 0.022)
Parameters: {'alpha': 0.1, 'fit_prior': True}



In [110]:
clf1 = LogisticRegression('l2')
clf2 = RandomForestClassifier(max_features='log2',n_estimators=100,\
                              criterion='entropy')
clf3 = BernoulliNB(alpha=0,fit_prior=True)
eclf = EnsembleClassifier(clfs=[clf1, clf2, clf3], weights=[1,1,1])
for clf, label in zip([eclf], ['Ensemble']):
    scores = cross_val_score(clf, review_sf, opinion_array,\
                cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.84 (+/- 0.02) [Ensemble]
