In [38]:
import json
import os
import codecs
import pandas as pd
import numpy as np

In [6]:
data_files = [f for f in os.listdir("data") if f.endswith(".json")]
data_files

['reviews_Automotive_5.json',
 'reviews_Baby_5.json',
 'reviews_Grocery_and_Gourmet_Food_5.json',
 'reviews_Patio_Lawn_and_Garden_5.json']

In [8]:
datas = []

for f in data_files:
    with codecs.open("data/"+f) as fp:
        for l in fp:
            rev = json.loads(l)

            datas.append(rev)

        
data = pd.DataFrame.from_records(datas)
print(data.shape)
data.head()

(345791, 9)


Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B00002243X,"[4, 4]",5.0,I needed a set of jumper cables for my new car...,"08 17, 2011",A3F73SC1LY51OO,Alan Montgomery,Work Well - Should Have Bought Longer Ones,1313539200
1,B00002243X,"[1, 1]",4.0,"These long cables work fine for my truck, but ...","09 4, 2011",A20S66SKYXULG2,alphonse,Okay long cables,1315094400
2,B00002243X,"[0, 0]",5.0,Can't comment much on these since they have no...,"07 25, 2013",A2I8LFSN2IS5EO,Chris,Looks and feels heavy Duty,1374710400
3,B00002243X,"[19, 19]",5.0,I absolutley love Amazon!!! For the price of ...,"12 21, 2010",A3GT2EWQSO45ZG,DeusEx,Excellent choice for Jumper Cables!!!,1292889600
4,B00002243X,"[0, 0]",5.0,I purchased the 12' feet long cable set and th...,"07 4, 2012",A3ESWJPAVRPWB4,E. Hernandez,"Excellent, High Quality Starter Cables",1341360000


In [9]:
NEGATIVE_REVIEW_MAX = 2
POSITIVE_REVIEW_MIN = 4

def review_sentiment(score):
    if score <= NEGATIVE_REVIEW_MAX:
        return 0
    elif score >= POSITIVE_REVIEW_MIN:
        return 1
    return -1



data["sentiment"] = data["overall"].map(review_sentiment)
data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,sentiment
0,B00002243X,"[4, 4]",5.0,I needed a set of jumper cables for my new car...,"08 17, 2011",A3F73SC1LY51OO,Alan Montgomery,Work Well - Should Have Bought Longer Ones,1313539200,1
1,B00002243X,"[1, 1]",4.0,"These long cables work fine for my truck, but ...","09 4, 2011",A20S66SKYXULG2,alphonse,Okay long cables,1315094400,1
2,B00002243X,"[0, 0]",5.0,Can't comment much on these since they have no...,"07 25, 2013",A2I8LFSN2IS5EO,Chris,Looks and feels heavy Duty,1374710400,1
3,B00002243X,"[19, 19]",5.0,I absolutley love Amazon!!! For the price of ...,"12 21, 2010",A3GT2EWQSO45ZG,DeusEx,Excellent choice for Jumper Cables!!!,1292889600,1
4,B00002243X,"[0, 0]",5.0,I purchased the 12' feet long cable set and th...,"07 4, 2012",A3ESWJPAVRPWB4,E. Hernandez,"Excellent, High Quality Starter Cables",1341360000,1


In [10]:
# remove middling reviews
data = data[data.sentiment != -1]
data.shape

(307933, 10)

In [15]:
# do some pre-processing of the text, lower case and stem everything
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [16]:
def clean_review_text(rev):
    """
        Lemmatize and lowercase everything, also remove punctuation
    """
    rev = rev.lower()
    tokens = rev.split()
    lemmatized_tokens = []
    for t in tokens:
        lemmatized_tokens.append(lemmatizer.lemmatize(t.strip(".,!'")))
        
    return ' '.join(lemmatized_tokens)

data["clean_text"] = data.reviewText.map(clean_review_text)

In [17]:
data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,sentiment,clean_text
0,B00002243X,"[4, 4]",5.0,I needed a set of jumper cables for my new car...,"08 17, 2011",A3F73SC1LY51OO,Alan Montgomery,Work Well - Should Have Bought Longer Ones,1313539200,1,i needed a set of jumper cable for my new car ...
1,B00002243X,"[1, 1]",4.0,"These long cables work fine for my truck, but ...","09 4, 2011",A20S66SKYXULG2,alphonse,Okay long cables,1315094400,1,these long cable work fine for my truck but th...
2,B00002243X,"[0, 0]",5.0,Can't comment much on these since they have no...,"07 25, 2013",A2I8LFSN2IS5EO,Chris,Looks and feels heavy Duty,1374710400,1,can't comment much on these since they have no...
3,B00002243X,"[19, 19]",5.0,I absolutley love Amazon!!! For the price of ...,"12 21, 2010",A3GT2EWQSO45ZG,DeusEx,Excellent choice for Jumper Cables!!!,1292889600,1,i absolutley love amazon for the price of a se...
4,B00002243X,"[0, 0]",5.0,I purchased the 12' feet long cable set and th...,"07 4, 2012",A3ESWJPAVRPWB4,E. Hernandez,"Excellent, High Quality Starter Cables",1341360000,1,i purchased the 12 foot long cable set and the...


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=50000)

In [27]:
bow = vectorizer.fit_transform(data["clean_text"])

In [43]:
train_test_split = 250000

train_data = bow[:train_test_split]
train_labels = data.sentiment[:train_test_split]
test_data = bow[train_test_split:]
test_labels = data.sentiment[train_test_split:]

In [32]:
bow.shape

(307933, 50000)

In [33]:
from sklearn.linear_model import LogisticRegression

In [35]:
lr_model = LogisticRegression()
lr_model.fit(train_data, train_labels)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
preds = lr_model.predict(test_data)
preds

array([1, 1, 0, ..., 1, 1, 1])

In [46]:
accuracy = np.mean(preds == test_labels.values)

In [47]:
accuracy

0.91956225294737026