In [11]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE

## Iteration # 1

In [12]:
# Load amazon review data
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
amazon["score_"] = amazon["score_"] == 1

# Find and add features
vect = CountVectorizer()
amazon_dtm = vect.fit_transform(amazon["review_"])
custom_feature_list = list(vect.get_feature_names())

for word in custom_feature_list:
    amazon[str(word)] = amazon["review_"].str.contains(' ' + str(word) + ' ', case=False)
amazon.head()

Unnamed: 0,review_,score_,10,100,11,12,13,15,15g,18,...,wrongly,year,years,yell,yes,yet,you,your,z500a,zero
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Build model
data = amazon[custom_feature_list]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)

# Evalute
print("Baseline performance: {}".format(len(amazon[amazon["score_"] == True]) / len(amazon["score_"])))
print("Accuracy score: {}".format(accuracy_score(target, y_pred)))

conf_mat = confusion_matrix(target, y_pred)
print("\nConfusion matrix:\n{}".format(conf_mat))

Baseline performance: 0.5
Accuracy score: 0.872

Confusion matrix:
[[393 107]
 [ 21 479]]


This model is really good at predicting positive reviews (479 correct out of 500), while there is some possible room for improvement in correctly predicting negative reviews.

Test this model again using a holdout group at a 50/50 split.

In [14]:
# Test using hold out
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.5)

y_pred_holdout = nb.fit(X_train, y_train).predict(X_test)
print("Accuracy score: {}".format(accuracy_score(y_test, y_pred_holdout)))

conf_mat = confusion_matrix(y_test, y_pred_holdout)
print("\nConfusion matrix:\n{}".format(conf_mat))

Accuracy score: 0.692

Confusion matrix:
[[196  40]
 [114 150]]


With the holdout group, the accuracy dropped from 0.872 to 0.692, which would indicate that it is overfitting. The model is still doing a good job predicting positive reviews while missing wildly on negative reviews.

For completeness, the model is rerun through cross-validation.

In [15]:
# Cross validation
cross_val_score(nb, data, target, cv=10)

array([0.79, 0.65, 0.71, 0.73, 0.71, 0.66, 0.75, 0.69, 0.7 , 0.69])

As expected, we see considerable fluctuation here (from 0.65 to 0.79) in accuracy scores. The model is overfitting.

## Iteration # 2
Limit the features to the top 100 per recursive feature elimition

In [16]:
# Use recursive feature elimination
selector = RFE(nb)
selector = selector.fit(data, target)
rankings = pd.DataFrame({'Features': data.columns, 'Ranking' : selector.ranking_})

rankings100 = rankings[rankings["Ranking"] <= 100]["Features"]
rankings100.head()

1    100
2     11
3     12
4     13
5     15
Name: Features, dtype: object

In [17]:
# Load amazon review data
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
amazon["score_"] = amazon["score_"] == 1

for word in rankings100:
    amazon[str(word)] = amazon["review_"].str.contains(' ' + str(word) + ' ', case=False)
amazon.head()

Unnamed: 0,review_,score_,100,11,12,13,15,15g,18,20,...,worth,worthless,worthwhile,wouldn,wow,wrong,wrongly,yell,yes,zero
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
# Build model
data = amazon[rankings100]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)

# Evalute
print("Baseline performance: {}".format(len(amazon[amazon["score_"] == True]) / len(amazon["score_"])))
print("Accuracy score: {}".format(accuracy_score(target, y_pred)))

conf_mat = confusion_matrix(target, y_pred)
print("\nConfusion matrix:\n{}".format(conf_mat))

Baseline performance: 0.5
Accuracy score: 0.706

Confusion matrix:
[[206 294]
 [  0 500]]


In [19]:
# Test using hold out
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.5)

y_pred_holdout = nb.fit(X_train, y_train).predict(X_test)
print("Accuracy score: {}".format(accuracy_score(y_test, y_pred_holdout)))

conf_mat = confusion_matrix(y_test, y_pred_holdout)
print("\nConfusion matrix:\n{}".format(conf_mat))

Accuracy score: 0.514

Confusion matrix:
[[ 17 243]
 [  0 240]]


Hitting all the positve reviews correctly, but almost missing all the negative reviews. 

In [20]:
# Cross validation
cross_val_score(nb, data, target, cv=10)

array([0.55, 0.54, 0.61, 0.59, 0.58, 0.56, 0.6 , 0.57, 0.6 , 0.55])

Overfitting seems to be less of a problem, but the fact that the model isn't able to classify negative sentiment reviews renders it all but useless.

## Iteration # 3
Limit the features to the top 50, using recursive feature elimination.

In [21]:
rankings50 = rankings[rankings["Ranking"] <= 50]["Features"]
rankings50.head()

1    100
2     11
3     12
4     13
5     15
Name: Features, dtype: object

In [22]:
# Load amazon review data
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
amazon["score_"] = amazon["score_"] == 1

for word in rankings50:
    amazon[str(word)] = amazon["review_"].str.contains(' ' + str(word) + ' ', case=False)
amazon.head()

Unnamed: 0,review_,score_,100,11,12,13,15,15g,18,20,...,worn,worst,worthless,wouldn,wow,wrong,wrongly,yell,yes,zero
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Build model
data = amazon[rankings50]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)

# Evalute
print("Baseline performance: {}".format(len(amazon[amazon["score_"] == True]) / len(amazon["score_"])))
print("Accuracy score: {}".format(accuracy_score(target, y_pred)))

conf_mat = confusion_matrix(target, y_pred)
print("\nConfusion matrix:\n{}".format(conf_mat))

Baseline performance: 0.5
Accuracy score: 0.702

Confusion matrix:
[[202 298]
 [  0 500]]


In [24]:
# Test using hold out
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.5)

y_pred_holdout = nb.fit(X_train, y_train).predict(X_test)
print("Accuracy score: {}".format(accuracy_score(y_test, y_pred_holdout)))

conf_mat = confusion_matrix(y_test, y_pred_holdout)
print("\nConfusion matrix:\n{}".format(conf_mat))

Accuracy score: 0.578

Confusion matrix:
[[ 27 211]
 [  0 262]]


In [25]:
# Cross validation
cross_val_score(nb, data, target, cv=10)

array([0.55, 0.54, 0.6 , 0.59, 0.58, 0.55, 0.59, 0.57, 0.6 , 0.54])

A small improvement over the previous iteration, but still doing very poorly at predicting negative reviews.

## Iteration # 4
Limit the features to the top 1, using recursive feature elimination.

In [26]:
rankings1 = rankings[rankings["Ranking"] <= 1]["Features"]
rankings1.head()

88        amp
92    angeles
93      angle
95     answer
97     antena
Name: Features, dtype: object

In [27]:
# Load amazon review data
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
amazon["score_"] = amazon["score_"] == 1

for word in rankings1:
    amazon[str(word)] = amazon["review_"].str.contains(' ' + str(word) + ' ', case=False)
amazon.head()

Unnamed: 0,review_,score_,amp,angeles,angle,answer,antena,anti,anything,anyway,...,worn,worst,worthless,wouldn,wow,wrong,wrongly,yell,yes,zero
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [28]:
# Build model
data = amazon[rankings1]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)

# Evalute
print("Baseline performance: {}".format(len(amazon[amazon["score_"] == True]) / len(amazon["score_"])))
print("Accuracy score: {}".format(accuracy_score(target, y_pred)))

conf_mat = confusion_matrix(target, y_pred)
print("\nConfusion matrix:\n{}".format(conf_mat))

Baseline performance: 0.5
Accuracy score: 0.697

Confusion matrix:
[[197 303]
 [  0 500]]


In [29]:
# Test using hold out
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.5)

y_pred_holdout = nb.fit(X_train, y_train).predict(X_test)
print("Accuracy score: {}".format(accuracy_score(y_test, y_pred_holdout)))

conf_mat = confusion_matrix(y_test, y_pred_holdout)
print("\nConfusion matrix:\n{}".format(conf_mat))

Accuracy score: 0.506

Confusion matrix:
[[ 18 247]
 [  0 235]]


In [30]:
# Cross validation
cross_val_score(nb, data, target, cv=10)

array([0.55, 0.54, 0.6 , 0.59, 0.58, 0.55, 0.59, 0.57, 0.6 , 0.54])

This is a bit worse than the iteration 3, but mainly the same story: the model is not predicting negative reviews.

## Iteration # 5 - Last ditch effort using online sentiment analysis word list
I found a [list of positve sentiment keywords](http://ptrckprry.com/course/ssd/data/positive-words.txt), a [list of negative sentiment keywords](http://ptrckprry.com/course/ssd/data/negative-words.txt), and I combined them into one csv file. This combined list becomes the model's features.

In [35]:
# Load amazon review data
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
amazon["score_"] = amazon["score_"] == 1

online_keywords = pd.read_csv("all_words.csv", header=None)

for word in online_keywords[0]:
    amazon[str(word)] = amazon["review_"].str.contains(' ' + str(word) + ' ', case=False)
amazon.head()

Unnamed: 0,review_,score_,a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,...,wrongly,wrought,yawn,zap,zapped,zaps,zealot,zealous,zealously,zombie
0,So there is no way for me to plug it in here i...,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [36]:
# Build model
data = amazon[online_keywords[0]]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)

# Evalute
print("Baseline performance: {}".format(len(amazon[amazon["score_"] == True]) / len(amazon["score_"])))
print("Accuracy score: {}".format(accuracy_score(target, y_pred)))

conf_mat = confusion_matrix(target, y_pred)
print("\nConfusion matrix:\n{}".format(conf_mat))

Baseline performance: 0.5
Accuracy score: 0.75

Confusion matrix:
[[467  33]
 [217 283]]


In [37]:
# Test using hold out
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.5)

y_pred_holdout = nb.fit(X_train, y_train).predict(X_test)
print("Accuracy score: {}".format(accuracy_score(y_test, y_pred_holdout)))

conf_mat = confusion_matrix(y_test, y_pred_holdout)
print("\nConfusion matrix:\n{}".format(conf_mat))

Accuracy score: 0.54

Confusion matrix:
[[238   2]
 [228  32]]


In [38]:
# Cross validation
cross_val_score(nb, data, target, cv=10)

array([0.68, 0.66, 0.71, 0.68, 0.72, 0.67, 0.72, 0.67, 0.69, 0.67])

The opposite of the previous iterations: that is, this model can predict negative reviews pretty well, but misses almost entirely on the positive reviews.