In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [59]:
# Load amazon review data
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
print("Rows: {}\nColumns: {}".format(amazon.shape[0], amazon.shape[1]))
amazon.head()

Rows: 1000
Columns: 2


Unnamed: 0,review_,score_
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


## First attempt with some brainstormed keywords

In [60]:
keywords = ["good", "excellent", "great", "impressed", "fantastic", "incredible", "awesome", "love",
           "hate", "poor", "awful", "terrible", "shoddy", "dislike", "slow"]
for word in keywords:
    amazon[str(word)] = amazon["review_"].str.contains(" " + str(word) + " ", case=False)
amazon.head()

Unnamed: 0,review_,score_,good,excellent,great,impressed,fantastic,incredible,awesome,love,hate,poor,awful,terrible,shoddy,dislike,slow
0,So there is no way for me to plug it in here i...,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",1,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [61]:
amazon["score_"] = amazon["score_"] == 1
amazon.head()

Unnamed: 0,review_,score_,good,excellent,great,impressed,fantastic,incredible,awesome,love,hate,poor,awful,terrible,shoddy,dislike,slow
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [63]:
data = amazon[keywords]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)
print("Number of mislabeled points out of a total {} points: {}".format(data.shape[0], (target != y_pred).sum()))
amazon_attempt_one_sucess_rate = 100 * (1 - (target != y_pred).sum() / data.shape[0])
print("Sucess rate: {:.2f}%".format(amazon_attempt_one_sucess_rate))

Number of mislabeled points out of a total 1000 points: 416
Sucess rate: 58.40%


## Second attempt using sentiment analysis words found online
I found a [list of positve sentiment keywords](http://ptrckprry.com/course/ssd/data/positive-words.txt), a [list of negative sentiment keywords](http://ptrckprry.com/course/ssd/data/negative-words.txt), and I combined them into one csv file. This combined list becomes the model's features.

In [64]:
# Load amazon review data again, refreshing hte dataframe
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
print("Rows: {}\nColumns: {}".format(amazon.shape[0], amazon.shape[1]))
amazon.head()

Rows: 1000
Columns: 2


Unnamed: 0,review_,score_
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [65]:
online_keywords = pd.read_csv("all_words.csv", header=None)

for word in online_keywords[0]:
    amazon[str(word)] = amazon["review_"].str.contains(' ' + str(word) + ' ', case=False)
amazon.head()

Unnamed: 0,review_,score_,a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,...,wrongly,wrought,yawn,zap,zapped,zaps,zealot,zealous,zealously,zombie
0,So there is no way for me to plug it in here i...,0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [66]:
amazon["score_"] = amazon["score_"] == 1
amazon.head()

Unnamed: 0,review_,score_,a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,...,wrongly,wrought,yawn,zap,zapped,zaps,zealot,zealous,zealously,zombie
0,So there is no way for me to plug it in here i...,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [67]:
data = amazon[online_keywords[0]]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)
print("Number of mislabeled points out of a total {} points: {}".format(data.shape[0], (target != y_pred).sum()))
amazon_attempt_two_sucess_rate = 100 * (1 - (target != y_pred).sum() / data.shape[0])
print("Sucess rate: {:.2f}%".format(amazon_attempt_two_sucess_rate))

Number of mislabeled points out of a total 1000 points: 250
Sucess rate: 75.00%


### Try the model on the Yelp review dataset

In [69]:
yelp = pd.read_csv("yelp_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
print("Rows: {}\nColumns: {}".format(amazon.shape[0], amazon.shape[1]))
yelp.head()

Rows: 1000
Columns: 6783


Unnamed: 0,review_,score_
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [70]:
for word in online_keywords[0]:
    yelp[str(word)] = yelp["review_"].str.contains(' ' + str(word) + ' ', case=False)
yelp.head()

Unnamed: 0,review_,score_,a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,...,wrongly,wrought,yawn,zap,zapped,zaps,zealot,zealous,zealously,zombie
0,Wow... Loved this place.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Crust is not good.,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Not tasty and the texture was just nasty.,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Stopped by during the late May bank holiday of...,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The selection on the menu was great and so wer...,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [37]:
yelp["score_"] = (yelp["score_"] == 1)
yelp.head()

Unnamed: 0,review_,score_,a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,...,wrongly,wrought,yawn,zap,zapped,zaps,zealot,zealous,zealously,zombie
0,Wow... Loved this place.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Crust is not good.,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Not tasty and the texture was just nasty.,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Stopped by during the late May bank holiday of...,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The selection on the menu was great and so wer...,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [71]:
data = yelp[online_keywords[0]]
target = yelp["score_"]
y_pred = nb.predict(data)
print("Number of mislabeled points out of a total {} points: {}".format(data.shape[0], (target != y_pred).sum()))
yelp_attempt_two_sucess_rate = 100 * (1 - (target != y_pred).sum() / data.shape[0])
print("Sucess rate: {:.2f}%".format(yelp_attempt_two_sucess_rate))

Number of mislabeled points out of a total 1000 points: 359
Sucess rate: 64.10%


## Third attempt custom features extracted with sklearn CountVectorizer

In [72]:
# Load amazon review data yet again, refreshing hte dataframe
amazon = pd.read_csv("amazon_cells_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
print("Rows: {}\nColumns: {}".format(amazon.shape[0], amazon.shape[1]))
amazon.head()

Rows: 1000
Columns: 2


Unnamed: 0,review_,score_
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [73]:
vect = CountVectorizer()
amazon_dtm = vect.fit_transform(amazon["review_"])
custom_feature_list = list(vect.get_feature_names())

In [74]:
for word in custom_feature_list:
    amazon[str(word)] = amazon["review_"].str.contains(' ' + str(word) + ' ', case=False)
amazon.head()

Unnamed: 0,review_,score_,10,100,11,12,13,15,15g,18,...,wrongly,year,years,yell,yes,yet,you,your,z500a,zero
0,So there is no way for me to plug it in here i...,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [75]:
amazon["score_"] = amazon["score_"] == 1
amazon.head()

Unnamed: 0,review_,score_,10,100,11,12,13,15,15g,18,...,wrongly,year,years,yell,yes,yet,you,your,z500a,zero
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [76]:
data = amazon[custom_feature_list]
target = amazon["score_"]
nb = BernoulliNB()
nb.fit(data, target)
y_pred = nb.predict(data)
print("Number of mislabeled points out of a total {} points: {}".format(data.shape[0], (target != y_pred).sum()))
amazon_attempt_three_sucess_rate = 100 * (1 - (target != y_pred).sum() / data.shape[0])
print("Sucess rate: {:.2f}%".format(amazon_attempt_three_sucess_rate))

Number of mislabeled points out of a total 1000 points: 128
Sucess rate: 87.20%


### Try the custome feature model on the Yelp review dataset

In [77]:
yelp = pd.read_csv("yelp_labelled.txt", delimiter="\t", header=None, names=["review_", "score_"])
print("Rows: {}\nColumns: {}".format(amazon.shape[0], amazon.shape[1]))
yelp.head()

Rows: 1000
Columns: 1849


Unnamed: 0,review_,score_
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [78]:
for word in custom_feature_list:
    yelp[str(word)] = yelp["review_"].str.contains(' ' + str(word) + ' ', case=False)
yelp.head()

Unnamed: 0,review_,score_,10,100,11,12,13,15,15g,18,...,wrongly,year,years,yell,yes,yet,you,your,z500a,zero
0,Wow... Loved this place.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Crust is not good.,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Not tasty and the texture was just nasty.,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Stopped by during the late May bank holiday of...,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The selection on the menu was great and so wer...,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [79]:
yelp["score_"] = (yelp["score_"] == 1)
yelp.head()

Unnamed: 0,review_,score_,10,100,11,12,13,15,15g,18,...,wrongly,year,years,yell,yes,yet,you,your,z500a,zero
0,Wow... Loved this place.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Crust is not good.,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Not tasty and the texture was just nasty.,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Stopped by during the late May bank holiday of...,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The selection on the menu was great and so wer...,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [81]:
data = yelp[custom_feature_list]
target = yelp["score_"]
y_pred = nb.predict(data)
print("Number of mislabeled points out of a total {} points: {}".format(data.shape[0], (target != y_pred).sum()))
yelp_attempt_three_sucess_rate = 100 * (1 - (target != y_pred).sum() / data.shape[0])
print("Sucess rate: {:.2f}%".format(yelp_attempt_three_sucess_rate))

Number of mislabeled points out of a total 1000 points: 324
Sucess rate: 67.60%


## Summary

In [99]:
print("Brainstormed key words:")
print("Amazon first attempt sucess rate:\t{:.1f}%".format(amazon_attempt_one_sucess_rate))
print("Amazon second attempt sucess rate:\t{:.1f}%".format(amazon_attempt_two_sucess_rate))
print("Difference between attempts:\t\t{:.1f}%".format(amazon_attempt_two_sucess_rate - amazon_attempt_one_sucess_rate))

Brainstormed key words:
Amazon first attempt sucess rate:	58.4%
Amazon second attempt sucess rate:	75.0%
Difference between attempts:		16.6%


Note: There wasn't an attempt to use this model on the yelp data

In [100]:
print("Found online key words:")
print("Amazon second attempt sucess rate:\t{:.1f}%".format(amazon_attempt_two_sucess_rate))
print("Yelp second attempt sucess rate:\t{:.1f}%".format(yelp_attempt_two_sucess_rate))
print("Difference between attempts:\t\t{:.1f}%".format(amazon_attempt_two_sucess_rate - yelp_attempt_two_sucess_rate))

Found online key words:
Amazon second attempt sucess rate:	75.0%
Yelp second attempt sucess rate:	64.1%
Difference between attempts:		10.9%


In [101]:
print("Custom features using sklearn CountVectorizer:")
print("Amazon third attempt sucess rate:\t{:.1f}%".format(amazon_attempt_three_sucess_rate))
print("Yelp third attempt sucess rate:\t\t{:.1f}%".format(yelp_attempt_three_sucess_rate))
print("Difference between attempts:\t\t{:.1f}%".format(amazon_attempt_three_sucess_rate - yelp_attempt_three_sucess_rate))

Custom features using sklearn CountVectorizer:
Amazon third attempt sucess rate:	87.2%
Yelp third attempt sucess rate:		67.6%
Difference between attempts:		19.6%


Both the online and keyword and custom feature keywords were, unsurprisingly, an improvement on the brainstormed keywords. The custom features did best overall on the Amazon dataset (87.2%), and on the Yelp dataset (67.6%). However, the dropoff between from Amazon to Yelp on the online keyword model was considerably less than the custom feature keyword dropoff (10.9% difference to a 19.6% difference). This suggests that the online keywords model is a much more general solution.