In [1]:
import pandas as pandas
# Load the Amazon baby-product reviews and replace missing review text with
# an empty string, so every entry in the 'review' column is a string.
products = pandas.read_csv("amazon_baby.csv").fillna({'review': ''})

In [2]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Works on both Python 3 ``str`` and Python 2 byte strings. The previous
    ``text.translate(None, string.punctuation)`` call is the Python 2
    byte-string API and raises ``TypeError`` on a Python 3 ``str``.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without any character listed in ``string.punctuation``.
    """
    import string
    try:
        # Python 3: the third argument of maketrans lists characters to delete.
        deletion_table = str.maketrans('', '', string.punctuation)
    except AttributeError:
        # Python 2: ``str`` has no ``maketrans``; use the two-argument translate.
        return text.translate(None, string.punctuation)
    return text.translate(deletion_table)

# Punctuation-free copy of every review, stored next to the raw text.
products['review_clean'] = [remove_punctuation(review) for review in products['review']]

In [3]:
# Drop 3-star reviews: they are treated as neutral and carry no sentiment label.
products = products.loc[products['rating'] != 3]

In [4]:
def get_sentiment(rating):
    """Map a star rating to a sentiment label.

    Returns 1 when ``rating`` is greater than 3 and -1 otherwise.
    """
    if rating > 3:
        return 1
    return -1

# Label every remaining review: +1 for ratings above 3, -1 otherwise.
products['sentiment'] = [get_sentiment(rating) for rating in products['rating']]

In [5]:
# Load the predefined train/test split. Each JSON file is expected to hold a
# list of row positions, which .iloc resolves against the current `products`
# frame (i.e. after the rating filter applied above).
import json

with open("module-2-assignment-train-idx.json") as trainfile:
    train_index = json.load(trainfile)
with open("module-2-assignment-test-idx.json") as testfile:
    test_index = json.load(testfile)

# Positional row selection; all columns are kept.
train_data = products.iloc[train_index]
test_data = products.iloc[test_index]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features. This token pattern (\w+ between word boundaries)
# keeps single-letter words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data only, assign one column
# per word, and convert the training reviews into a sparse count matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix using the same
# word-to-column mapping (transform only, no refit on the test reviews).
test_matrix = vectorizer.transform(test_data['review_clean'])

# Using LogisticRegression

In [7]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the full bag-of-words features.
# fit() returns the estimator itself, so `sentiment_model` and `alg` refer to
# the same fitted object.
alg = LogisticRegression()
alg.fit(train_matrix, train_data['sentiment'])
sentiment_model = alg

In [8]:
# Coefficients of the fitted model (first row of coef_).
coeffs = sentiment_model.coef_[0]
# Total number of coefficients, then how many of them are non-negative.
print(len(coeffs))
print((coeffs >= 0).sum())

121712
85911


In [9]:
# Take three test reviews (positions 10, 11 and 12) as a small worked example
# and show the raw text of the first two.
sample_test_data = test_data.iloc[10:13]
print(sample_test_data['review'].iloc[0])
print("\n")
print(sample_test_data['review'].iloc[1])

Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.


Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.


In [10]:
# Score the sample reviews with the sentiment model, then turn each raw
# decision score into a class label by hand: a non-negative score means +1,
# a negative score means -1.
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)
print([+1 if score >= 0 else -1 for score in scores])

[  5.60798627  -3.1429946  -10.44043584]
[1, -1, -1]


In [11]:
# Predict the class labels of the sample reviews directly with the model's
# predict method (for comparison with the hand-thresholded decision scores).
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1])

In [12]:
# Predict class probabilities for the sample reviews using predict_proba.
# classes_ is printed first so the probability columns can be matched to
# their class labels.
print(sentiment_model.classes_)
print(sentiment_model.predict_proba(sample_test_matrix))

[-1  1]
[[  3.65504085e-03   9.96344959e-01]
 [  9.58631800e-01   4.13681997e-02]
 [  9.99970774e-01   2.92256134e-05]]


In [13]:
import numpy as np
# Reproduce the predict_proba numbers by hand with the sigmoid function.
# 5.60798627 is the first decision score printed for the sample reviews.
# 1/(1+exp(score)) is the probability of class -1 and 1/(1+exp(-score)) the
# probability of class +1; the last two lines do the same for all scores.
print(1/(1+np.exp(5.60798627)))
print(1/(1+np.exp(-5.60798627)))
print(1/(1+np.exp(scores)))
print(1/(1+np.exp(-1*scores)))
# The last sample review has the lowest probability of being classified as a
# positive review.

0.00365504085925
0.996344959141
[ 0.00365504  0.9586318   0.99997077]
[  9.96344959e-01   4.13681997e-02   2.92256134e-05]


# Calculate probability for all test data

In [14]:
# Probability of the positive class for every test review, obtained by
# passing the decision scores through the sigmoid function.
test_matrix = vectorizer.transform(test_data['review_clean'])
final_scores = sentiment_model.decision_function(test_matrix)
final_probab_scores = 1.0 / (1.0 + np.exp(-final_scores))
# Sort ascending in place, then display from most to least positive.
final_probab_scores.sort()
print(final_probab_scores[::-1])

[  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   8.27723545e-14
   1.73290201e-15   8.93702816e-16]


In [15]:
# Rank the test products by the predicted probability of a positive review.
predictions = sentiment_model.predict_proba(test_matrix)
# Locate the column of the +1 class from classes_ instead of assuming it is
# always column 1.
positive_column = list(sentiment_model.classes_).index(1)
positive_predictions = predictions[:, positive_column]
# A real copy: plain assignment would only create a second name for the view.
positive_predictions_copy = positive_predictions.copy()
products_name = test_data['name']
# Build the frame column by column so 'predictions' stays a float column.
# Stacking the probabilities and names into one numpy array would coerce
# everything to object dtype. `.values` pairs names with probabilities by
# position and leaves the frame with a fresh 0..n-1 index.
df = pandas.DataFrame(
    {'predictions': positive_predictions, 'name': products_name.values},
    columns=['predictions', 'name'],
)
df = df.sort_values(by=['predictions'], ascending=[False])
print(df.head(20))

      predictions                                               name
20743           1  Fisher-Price Cradle 'N Swing,  My Little Snuga...
30634           1  Graco FastAction Fold Jogger Click Connect Str...
24899           1         Graco Pack 'n Play Element Playard - Flint
21531           1  Roan Rocco Classic Pram Stroller 2-in-1 with B...
17558           1  Freemie Hands-Free Concealable Breast Pump Col...
25554           1         Diono RadianRXT Convertible Car Seat, Plum
9555            1  Evenflo X Sport Plus Convenience Stroller - Ch...
18112           1  Infantino Wrap and Tie Baby Carrier, Black Blu...
9125            1         P'Kolino Silly Soft Seating in Tias, Green
32782           1      Mamas &amp; Papas 2014 Urbo2 Stroller - Black
26830           1  Baby Jogger City Mini GT Single Stroller, Shad...
24286           1                  Britax 2012 B-Agile Stroller, Red
30535           1  Buttons Cloth Diaper Cover - One Size - 8 Colo...
11923           1       Evenflo 6 

In [16]:
# df is sorted by 'predictions' in descending order, so the last 20 rows are
# the products with the lowest predicted probability of a positive review.
df.tail(20)

Unnamed: 0,predictions,name
5831,1.6146e-09,"Regalo My Cot Portable Bed, Royal Blue"
15062,1.49428e-09,"Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs"
205,1.10018e-09,Safety 1st Deluxe 4-in-1 Bath Station
28120,8.2667e-10,VTech Communications Safe &amp; Sound Digital ...
27231,7.09893e-10,NUK Cook-n-Blend Baby Food Maker
7310,6.50982e-10,Chicco Cortina KeyFit 30 Travel System in Adve...
31226,6.17932e-10,Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...
13751,6.05155e-10,"Peg-Perego Tatamia High Chair, White Latte"
10814,4.57446e-10,Ellaroo Mei Tai Baby Carrier - Hershey
1810,4.42926e-10,Cosco Alpha Omega Elite Convertible Car Seat


# Compute accuracy of classifier

In [17]:
# Accuracy = number of test reviews whose predicted label equals the true
# sentiment, divided by the number of predictions.
predictions = sentiment_model.predict(test_matrix)
accuracy = float((test_data['sentiment'] == predictions).sum()) / len(predictions)
print(accuracy)

0.932235421166


# Another classifier with fewer words

In [18]:
# Second feature set: word counts restricted to 20 hand-picked words.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]
# Passing `vocabulary` fixes the columns to exactly these words.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [19]:
# Fit a second logistic-regression classifier on the 20-word features.
# fit() returns the estimator itself, so `simple_model` and `alg2` refer to
# the same fitted object.
alg2 = LogisticRegression()
alg2.fit(train_matrix_word_subset, train_data['sentiment'])
simple_model = alg2

In [20]:
# Table pairing each significant word with a coefficient of the simple model,
# matched by position.
# NOTE(review): this assumes coef_ follows the order of significant_words,
# i.e. the vocabulary passed to the vectorizer -- confirm.
simple_model_coef_table = pandas.DataFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})

In [21]:
# Show the words ordered from the largest to the smallest coefficient.
simple_model_coef_table.sort_values('coefficient', ascending=False)

Unnamed: 0,coefficient,word
6,1.673074,loves
5,1.509812,perfect
0,1.36369,love
2,1.192538,easy
1,0.944,great
4,0.520186,little
7,0.50376,well
8,0.190909,able
3,0.085513,old
9,0.058855,car


In [22]:
# Coefficients of the full-vocabulary sentiment model as a flat 1-D array.
sentiment_model.coef_.flatten()

array([ -1.23889324e+00,   1.59863291e-04,   2.63828080e-02, ...,
         1.17685365e-02,   3.10346626e-03,  -6.36644403e-05])

# Comparing Sentiment Model and Simple Model

In [29]:
# Accuracy of the sentiment model on the training data: correctly predicted
# reviews divided by the number of predictions.
predictions = sentiment_model.predict(train_matrix)
accuracy = float((train_data['sentiment'] == predictions).sum()) / len(predictions)
print(accuracy)

0.96800233855


In [30]:
# Accuracy of the simple (20-word) model on the training data: correctly
# predicted reviews divided by the number of predictions.
predictions = simple_model.predict(train_matrix_word_subset)
accuracy = float((train_data['sentiment'] == predictions).sum()) / len(predictions)
print(accuracy)

0.866822570007


In [31]:
# Accuracy of the sentiment model on the test data: correctly predicted
# reviews divided by the number of predictions.
predictions = sentiment_model.predict(test_matrix)
accuracy = float((test_data['sentiment'] == predictions).sum()) / len(predictions)
print(accuracy)

0.932235421166


In [32]:
# Accuracy of the simple (20-word) model on the test data: correctly
# predicted reviews divided by the number of predictions.
predictions = simple_model.predict(test_matrix_word_subset)
accuracy = float((test_data['sentiment'] == predictions).sum()) / len(predictions)
print(accuracy)

0.869360451164


In [33]:
# Accuracy of the majority-class baseline on the test data.
# The majority label is taken from the training data rather than hard-coded,
# and the denominator is the size of the test set itself, not the length of
# whatever `predictions` array an earlier cell happened to leave behind.
majority_class = train_data['sentiment'].mode()[0]
accuracy = float((test_data['sentiment'] == majority_class).sum()) / len(test_data)
print(accuracy)

0.842782577394
