In [116]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

In [87]:
data = pd.read_json('reviews_Office_Products_5.json', lines=True)

In [88]:
data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B00000JBLH,"[3, 4]",5,"I bought my first HP12C in about 1984 or so, a...","09 3, 2004",A32T2H8150OJLU,ARH,"A solid performer, and long time friend",1094169600
1,B00000JBLH,"[7, 9]",5,WHY THIS BELATED REVIEW? I feel very obliged t...,"12 15, 2007",A3MAFS04ZABRGO,"Let it Be ""Alan""","Price of GOLD is up, so don't bury the golden ...",1197676800
2,B00000JBLH,"[3, 3]",2,I have an HP 48GX that has been kicking for mo...,"01 1, 2011",A1F1A0QQP2XVH5,Mark B,"Good functionality, but not durable like old HPs",1293840000
3,B00000JBLH,"[7, 8]",5,I've started doing more finance stuff recently...,"04 19, 2006",A49R5DBXXQDE5,R. D Johnson,One of the last of an almost extinct species,1145404800
4,B00000JBLH,"[0, 0]",5,For simple calculations and discounted cash fl...,"08 4, 2013",A2XRMQA6PJ5ZJ8,Roger J. Buffington,Still the best,1375574400


In [89]:
#Cut data down to only include the useable data
data_short = data.loc[:,['overall','reviewText','reviewTime','summary', 'unixReviewTime']]

In [90]:
data_short.head()

Unnamed: 0,overall,reviewText,reviewTime,summary,unixReviewTime
0,5,"I bought my first HP12C in about 1984 or so, a...","09 3, 2004","A solid performer, and long time friend",1094169600
1,5,WHY THIS BELATED REVIEW? I feel very obliged t...,"12 15, 2007","Price of GOLD is up, so don't bury the golden ...",1197676800
2,2,I have an HP 48GX that has been kicking for mo...,"01 1, 2011","Good functionality, but not durable like old HPs",1293840000
3,5,I've started doing more finance stuff recently...,"04 19, 2006",One of the last of an almost extinct species,1145404800
4,5,For simple calculations and discounted cash fl...,"08 4, 2013",Still the best,1375574400


In [91]:
#reviewTime is a bit useless as is
#Separate out months
month = []
for item in data_short.reviewTime:
    month.append(int(item[0:2]))
month_a = pd.DataFrame(np.array(month).reshape(53258,1))
month_a.columns=['reviewMonth']

#Separate out days
day = []
for item in data_short.reviewTime:
    l = len(item)
    if l == 10:
        day.append(item[3:4])
    if l == 11:
        day.append(item[3:5])
day_a = pd.DataFrame(np.array(day).reshape(53258,1))
day_a.columns=['reviewDay']

#Separate out year
year = []
for item in data_short.reviewTime:
    l = len(item)
    if l == 10:
        year.append(item[6:])
    if l == 11:
        year.append(item[7:])
        
year_a = pd.DataFrame(np.array(year).reshape(53258,1))
year_a.columns=['reviewYear']

In [92]:
data_short = pd.concat([data_short, day_a], axis=1)
data_short = pd.concat([data_short, month_a], axis=1)
data_short = pd.concat([data_short, year_a], axis=1)
data_short = data_short.drop(columns=['reviewTime'])

In [93]:
data_short.head()

Unnamed: 0,overall,reviewText,summary,unixReviewTime,reviewDay,reviewMonth,reviewYear
0,5,"I bought my first HP12C in about 1984 or so, a...","A solid performer, and long time friend",1094169600,3,9,2004
1,5,WHY THIS BELATED REVIEW? I feel very obliged t...,"Price of GOLD is up, so don't bury the golden ...",1197676800,15,12,2007
2,2,I have an HP 48GX that has been kicking for mo...,"Good functionality, but not durable like old HPs",1293840000,1,1,2011
3,5,I've started doing more finance stuff recently...,One of the last of an almost extinct species,1145404800,19,4,2006
4,5,For simple calculations and discounted cash fl...,Still the best,1375574400,4,8,2013


In [94]:
#Feature lists
negation = ["n't","not"]
quality = ['best','worst','excellent','terrible','well', 'good','bad','great','okay'
          ,'perfect']
price = ['price','expensive','cheap', 'low','high']
supply_specific = ['broken','explode','heat','error','durable']

features = [negation, quality, price, supply_specific]

In [95]:
#Replace all the punctuation so it is not an issue
data_short['reviewText'] = data_short['reviewText'].str.replace(',','')
data_short['reviewText'] = data_short['reviewText'].str.replace('!','')
data_short['reviewText'] = data_short['reviewText'].str.replace('.','')
data_short['reviewText'] = data_short['reviewText'].str.replace('?','')

In [96]:
#Function to easily create features of each word
def add_columns_nospace(list, data):
    for word in list:
        data[str(word)] = (data.reviewText.str.contains(str(word))).astype(int)

In [97]:
#Iterate through to create features
for item in features:
    add_columns_nospace(item, data_short)

In [101]:
#Turn overall into binary >= 3 good => 1, <3 bad => 0
data_short.overall = (data_short.overall >= 3).astype(int)

In [106]:
X = data_short.drop(columns=['overall','reviewText','summary'])
y = data_short.overall

In [115]:
#Run Bernoulli Classifer, same score as Decision Tree and Random Forest
bnb = GaussianNB()
bnb.fit(X,y)

y_pred = bnb.predict(X)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    X.shape[0],
    (y != y_pred).sum()
))


Number of mislabeled points out of a total 53258 points : 2856


In [120]:
print((y == y_pred).sum()/X.shape[0])

0.946374253633257


In [121]:
#Run Decision Tree Classifier
decision_tree = tree.DecisionTreeClassifier(criterion='entropy',
                                max_depth=4,
                                max_features=1)
decision_tree.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=1, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [122]:
cross_val_score(decision_tree, X, y)

array([0.94637526, 0.94637526, 0.94625958])

In [128]:
rfc = ensemble.RandomForestClassifier(n_estimators=4, criterion='entropy', max_depth=4)
rfc.fit(X,y)
cross_val_score(rfc, X, y)

array([0.94637526, 0.9462626 , 0.94637224])