# Amazon Reviews sentiment classifier

## Model: Logistic Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [4]:
# import the dataset
df = pd.read_csv("amazon_baby.csv")
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


#### Preprocessing

In [5]:
import string

def remove_punctuation(text):
    remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(remove_punct_map)

In [6]:
df = df.fillna({'review':''})
df['review_clean'] = df['review'].apply(remove_punctuation)
df = df[df['rating'] != 3]
df['sentiment'] = df['rating'].apply(lambda rating : +1 if rating > 3 else -1)
df.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


### Create train/test sets

In [9]:
import json

with open("module-2-assignment-train-idx.json") as json_file:
    train_idx = json.load(json_file)
with open("module-2-assignment-test-idx.json") as json_file:
    test_idx = json.load(json_file)

train_data = df.iloc[train_idx]
test_data = df.iloc[test_idx]
print("Train len = {}".format(len(train_data)))
print("Test len = {}".format(len(test_data)))

Train len = 133416
Test len = 33336


### Countvectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [15]:
# number of examples * total number of columns (distinct words)
print(train_matrix.shape)
print(test_matrix.shape)


(133416, 121712)
(33336, 121712)


### Training Logistic Regression model

In [32]:
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
# Number of weights >= 0
print(sentiment_model.intercept_)
print(np.sum(sentiment_model.coef_ >= 0))

[1.37846807]
85876


### Sample test

In [34]:
sample_test_data = test_data[10:13]
print(sample_test_data)

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  


In [35]:
print(sample_test_data.iloc[0]['review'])
print(sample_test_data.iloc[1]['review'])

Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.
Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.


In [47]:
### Make prediction on sample

sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print("Scores : {}".format(scores))
y_pred = scores > 0
probs = 1 / (1 + np.exp(-scores))
print("Predictions : {}".format(y_pred))
print("Probabilities : {}".format(probs))

Scores : [  5.61494955  -3.13452062 -10.41199092]
Predictions : [ True False False]
Probabilities : [9.96370230e-01 4.17055602e-02 3.00688444e-05]


In [48]:
# alternatively
print(sentiment_model.predict(sample_test_matrix))
print(sentiment_model.predict_proba(sample_test_matrix))

[ 1 -1 -1]
[[3.62977024e-03 9.96370230e-01]
 [9.58294440e-01 4.17055602e-02]
 [9.99969931e-01 3.00688444e-05]]


##### Most positive 20 reviews

In [49]:
all_scores = sentiment_model.decision_function(test_matrix)
top20 = sorted(range(len(all_scores)), key=lambda i: all_scores[i])[-20:]
test_data.iloc[top20]

Unnamed: 0,name,review,rating,review_clean,sentiment
147996,"Baby Jogger City Mini GT Double Stroller, Shad...","We are well pleased with this stroller, and I ...",4,We are well pleased with this stroller and I w...,1
182089,Summer Infant Wide View Digital Color Video Mo...,I love this baby monitor. I can compare this ...,5,I love this baby monitor I can compare this o...,1
22586,"Britax Decathlon Convertible Car Seat, Tiffany",I researched a few different seats to put in o...,4,I researched a few different seats to put in o...,1
165593,Ikea 36 Pcs Kalas Kids Plastic BPA Free Flatwa...,For the price this set is unbelievable- and tr...,5,For the price this set is unbelievable and tru...,1
114796,"Fisher-Price Cradle 'N Swing, My Little Snuga...",My husband and I cannot state enough how much ...,5,My husband and I cannot state enough how much ...,1
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1
147949,"Baby Jogger City Mini GT Single Stroller, Shad...","Amazing, Love, Love, Love it !!! All 5 STARS a...",5,Amazing Love Love Love it All 5 STARS all the...,1
97325,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1
168697,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1
80155,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,I just tried this hands free breastpump bra an...,1


#### Most negative 20 reviews

In [50]:
bottom20 = sorted(range(len(all_scores)), key=lambda i: all_scores[i])[:20]
test_data.iloc[bottom20]

Unnamed: 0,name,review,rating,review_clean,sentiment
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1


#### Accuracy

In [57]:
y_pred = all_scores > 0
accuracy = np.sum(np.equal(y_pred, test_data['sentiment'])) / y_pred.shape[0]
accuracy

0.818454523638109

In [59]:
# alternatively
sentiment_model.score(test_matrix,test_data['sentiment'])

0.9320254379649628

## Training with important words only

In [60]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [61]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

#### Coefficient table

In [63]:
simple_model_coef_table = pd.DataFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})
simple_model_coef_table = simple_model_coef_table.sort_values('coefficient')
print(np.sum(simple_model_coef_table['coefficient']>0))
simple_model_coef_table

10


Unnamed: 0,word,coefficient
14,disappointed,-2.348298
19,return,-2.109331
13,waste,-2.033699
10,broke,-1.651576
17,money,-0.898031
15,work,-0.621169
12,even,-0.51138
18,would,-0.362167
16,product,-0.320556
11,less,-0.209563


##### Accuracy

In [64]:
print(test_matrix_word_subset.shape)

(33336, 20)


In [66]:
all_scores_simple = simple_model.decision_function(test_matrix_word_subset)
y_pred_simple = all_scores_simple > 0
accuracy_simple = np.sum(np.equal(y_pred_simple, test_data['sentiment'])) / y_pred.shape[0]
accuracy_simple

0.8287736981041517

In [67]:
simple_model.score(test_matrix_word_subset, test_data['sentiment'])

0.8693604511639069