In [3]:
import pandas as pd
import numpy as np

In [4]:
products = pd.read_csv('amazon_baby.csv')

In [5]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [6]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Works for both text strings (Python 3 ``str`` / Python 2 ``unicode``)
    and byte strings (Python 2 ``str`` / Python 3 ``bytes``). The result
    has the same type as the input.

    Parameters
    ----------
    text : str or bytes
        The review text to clean. Missing values must be replaced by an
        empty string before calling this function.

    Returns
    -------
    str or bytes
        ``text`` without any character listed in ``string.punctuation``.
    """
    import string
    if isinstance(text, bytes):
        # Byte strings accept the (table, deletechars) form; a ``None`` table
        # means "no remapping, only delete".
        return text.translate(None, string.punctuation.encode('ascii'))
    # Text strings take a single mapping of code point -> replacement;
    # mapping a code point to None deletes it.
    return text.translate(dict.fromkeys(map(ord, string.punctuation)))

In [7]:
len(products[products['review'].isnull()])

829

In [8]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

In [9]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [10]:
# Ignore neutral reviews (i.e. rating == 3). Take an explicit copy of the
# filtered rows so the column assignment below writes to an independent
# frame rather than to a slice of the original (which would raise pandas'
# SettingWithCopyWarning).
products = products[products['rating'] != 3].copy()
# Create the sentiment column: +1 for ratings above 3, -1 otherwise
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [12]:
# Load the train / test index files supplied with the assignment
train_index = pd.read_json('module-2-assignment-train-idx.json')
test_index = pd.read_json('module-2-assignment-test-idx.json')

# Give the single column of each frame a readable name
train_index.columns = test_index.columns = ['indexvalue']

# Plain Python lists of the index values
id_train = train_index['indexvalue'].tolist()
id_test = test_index['indexvalue'].tolist()

In [13]:
len(id_train), len(id_test), len(id_train) + len(id_test), len(products)

(133416, 33336, 166752, 166752)

In [14]:
# id_train / id_test are used as integer row positions (iloc is position-based)
train_data, test_data = products.iloc[id_train], products.iloc[id_test]

In [15]:
len(train_data), len(test_data), train_data.shape, test_data.shape

(133416, 33336, (133416, 5), (33336, 5))

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Use this token pattern to keep single-letter words
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data and assign columns to words,
# then convert the training data into a sparse matrix (one row per review)
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
# (transform only: the vocabulary is not re-learned from the test reviews)
test_matrix = vectorizer.transform(test_data['review_clean'])

In [139]:
train_matrix

<133416x121712 sparse matrix of type '<type 'numpy.int64'>'
	with 7326618 stored elements in Compressed Sparse Row format>

In [140]:
train_matrix.shape, train_data['sentiment'].shape

((133416, 121712), (133416,))

In [141]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier (default settings) on the word counts.
# NOTE: fit() returns the estimator it was called on, so `sentiment_model`
# and `logistics` are two names for the same object; calling
# `logistics.fit(...)` again later also changes `sentiment_model`.
logistics = LogisticRegression()
sentiment_model = logistics.fit(train_matrix, train_data['sentiment'])

In [142]:
(sentiment_model.coef_ > 0).sum(), len(sentiment_model.coef_[0])

(87059, 121712)

In [143]:
train_preds = sentiment_model.predict(train_matrix)

In [144]:
sample_test_data = test_data[10:13]
print sample_test_data

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  


In [145]:
sample_test_data.iloc[0].review

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [146]:
sample_test_data.iloc[1].review

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

 We will now make a class prediction for the sample_test_data. The sentiment_model should predict +1 if the sentiment is positive and -1 if the sentiment is negative. Recall from the lecture that the score (sometimes called margin) for the logistic regression model is defined as:

$$\text{score}_i = \mathbf{w}^T h(\mathbf{x}_i)$$

where h(x_i) represents the features for data point i. We will write some code to obtain the scores. For each row, the score (or margin) is a number in the range (-inf, inf). Use a pre-built function in your tool to calculate the score of each data point in sample_test_data. In scikit-learn, you can call the decision_function() function.

In [147]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print scores

[  5.60150644  -3.17110494 -10.42378277]


In [148]:
pred = sentiment_model.predict(sample_test_matrix)

In [149]:
pred

array([ 1, -1, -1])

In [150]:
probabilities = 1/(1+np.exp(-scores))

In [151]:
probabilities

array([  9.96321286e-01,   4.02676921e-02,   2.97163701e-05])

In [152]:
for p in probabilities:
    print "$%.6f" % (p)

$0.996321
$0.040268
$0.000030


In [153]:
sentiment_model.predict_proba(sample_test_matrix)

array([[  3.67871441e-03,   9.96321286e-01],
       [  9.59732308e-01,   4.02676921e-02],
       [  9.99970284e-01,   2.97163701e-05]])

In [154]:
# Predict all test data
# Build the test feature matrix with the already-fitted vectorizer
test_matrix = vectorizer.transform(test_data['review_clean'])
# Raw score (margin) of every test review
all_scores = sentiment_model.decision_function(test_matrix)
# Sigmoid link: P(sentiment = +1) = 1 / (1 + exp(-score))
all_prob = 1/(1+np.exp(-all_scores))

In [155]:
sort_prob = np.argsort(all_prob)

In [156]:
sort_prob

array([ 2931, 21700, 13939, ...,  9125, 21531,  4140])

In [157]:
all_prob[sort_prob[-20:]].tolist()

[0.9999999999999996,
 0.9999999999999998,
 0.9999999999999998,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [158]:
# The 20 test reviews the model scores as most likely positive
# (ascending order, so the highest probability is the last row)
test_data.iloc[sort_prob[-20:]]

Unnamed: 0,name,review,rating,review_clean,sentiment
172351,Phil &amp; Teds Navigator Buggy Golden Kiwi Fr...,I'm pretty happy with this stroller. I use it ...,4,Im pretty happy with this stroller I use it wi...,1
182089,Summer Infant Wide View Digital Color Video Mo...,I love this baby monitor. I can compare this ...,5,I love this baby monitor I can compare this o...,1
165593,Ikea 36 Pcs Kalas Kids Plastic BPA Free Flatwa...,For the price this set is unbelievable- and tr...,5,For the price this set is unbelievable and tru...,1
87017,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1
133651,"Britax 2012 B-Agile Stroller, Red",[I got this stroller for my daughter prior to ...,4,I got this stroller for my daughter prior to t...,1
97325,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1
80155,"Simple Wishes Hands-Free Breastpump Bra, Pink,...","I just tried this hands free breastpump bra, a...",5,I just tried this hands free breastpump bra an...,1
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1


In [159]:
all_prob[sort_prob[:20]].tolist()

[8.474422799019441e-16,
 1.594857094120814e-15,
 8.141166559006492e-14,
 9.830461809782829e-14,
 1.9417930718124784e-13,
 3.324654597634677e-13,
 3.272252524683938e-11,
 3.329505751100866e-11,
 9.494597418462951e-11,
 9.585600327512885e-11,
 4.353072105605993e-10,
 4.385850311973944e-10,
 5.587867899903182e-10,
 5.691483535653738e-10,
 5.800051446702902e-10,
 6.175407371779014e-10,
 8.036933510898024e-10,
 1.0768172542087048e-09,
 1.5918585913483078e-09,
 1.6425933972523393e-09]

In [160]:
# The 20 test reviews the model scores as least likely positive
# (ascending order, so the lowest probability is the first row)
test_data.iloc[sort_prob[:20]]

Unnamed: 0,name,review,rating,review_clean,sentiment
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1


In [161]:
prob_preds = sentiment_model.predict_proba(test_matrix)

In [162]:
# Test predict_proba function
for i in range(20):
    print prob_preds[i], all_prob[i]

[ 0.21548939  0.78451061] 0.784510607616
[  7.63755400e-07   9.99999236e-01] 0.999999236245
[ 0.06680174  0.93319826] 0.9331982568
[  2.05313706e-05   9.99979469e-01] 0.999979468629
[ 0.01978281  0.98021719] 0.980217186589
[  4.11603931e-05   9.99958840e-01] 0.999958839607
[ 0.00124386  0.99875614] 0.998756135642
[ 0.19881561  0.80118439] 0.801184387327
[ 0.00155328  0.99844672] 0.998446724343
[ 0.00262609  0.99737391] 0.997373911548
[ 0.00367871  0.99632129] 0.996321285592
[ 0.95973231  0.04026769] 0.0402676920594
[  9.99970284e-01   2.97163701e-05] 2.97163700568e-05
[ 0.00421181  0.99578819] 0.995788189328
[ 0.0070009  0.9929991] 0.992999095734
[ 0.02292531  0.97707469] 0.977074689965
[ 0.00727957  0.99272043] 0.992720432886
[ 0.05998065  0.94001935] 0.940019354868
[ 0.96011081  0.03988919] 0.0398891859162
[ 0.00102191  0.99897809] 0.998978088583


In [163]:
# Calculate accuracy
output_preds = sentiment_model.predict(test_matrix)

In [164]:
np.sum(output_preds == test_data['sentiment']), len(test_data)

(31079, 33336)

In [165]:
accuracy = float(np.sum(output_preds == test_data['sentiment'])) / float(len(test_data))
print accuracy

0.932295416367


In [166]:
# Fewer words, less number of features
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [167]:
# Count only the words in significant_words: the vocabulary is fixed up front,
# so each matrix has one column per listed word, in list order.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
# Word counts for the training reviews, restricted to the fixed vocabulary
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
# Same restricted counts for the test reviews
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [168]:
# Train the 20-word model on its own estimator. fit() returns the estimator
# it was called on, so fitting the shared `logistics` object here would make
# `simple_sentiment_model` the same object as `sentiment_model` and overwrite
# the full-vocabulary model's coefficients with the 20-word ones.
simple_sentiment_model = LogisticRegression().fit(train_matrix_word_subset, train_data['sentiment'])

In [169]:
# Pair each word with its learned coefficient. Materialise the pairs as a list
# so the table can be displayed and then sorted; a bare zip() is a one-shot
# iterator on Python 3 (on Python 2 the result is the same list as before).
simple_model_coef_table = list(zip(significant_words, simple_sentiment_model.coef_.flatten().tolist()))

In [170]:
simple_model_coef_table

[('love', 1.3636897593113122),
 ('great', 0.9439995905715208),
 ('easy', 1.1925382734893262),
 ('old', 0.08551277946306653),
 ('little', 0.520185762718075),
 ('perfect', 1.5098124766921264),
 ('loves', 1.673073892593298),
 ('well', 0.5037604577674004),
 ('able', 0.19090857206453168),
 ('car', 0.05885467115257516),
 ('broke', -1.6515763449653662),
 ('less', -0.20956286453466788),
 ('even', -0.5113796317987785),
 ('waste', -2.0336986139403703),
 ('disappointed', -2.3482982195026),
 ('work', -0.6211687736414516),
 ('product', -0.32055623673463246),
 ('money', -0.8980307377147477),
 ('would', -0.36216674227363427),
 ('return', -2.10933109031867)]

In [171]:
sorted(simple_model_coef_table, key=lambda x: x[1])

[('disappointed', -2.3482982195026),
 ('return', -2.10933109031867),
 ('waste', -2.0336986139403703),
 ('broke', -1.6515763449653662),
 ('money', -0.8980307377147477),
 ('work', -0.6211687736414516),
 ('even', -0.5113796317987785),
 ('would', -0.36216674227363427),
 ('product', -0.32055623673463246),
 ('less', -0.20956286453466788),
 ('car', 0.05885467115257516),
 ('old', 0.08551277946306653),
 ('able', 0.19090857206453168),
 ('well', 0.5037604577674004),
 ('little', 0.520185762718075),
 ('great', 0.9439995905715208),
 ('easy', 1.1925382734893262),
 ('love', 1.3636897593113122),
 ('perfect', 1.5098124766921264),
 ('loves', 1.673073892593298)]

In [172]:
from sklearn.metrics import accuracy_score

In [173]:
print accuracy_score(train_data['sentiment'], train_preds)

0.968504527193


In [174]:
# Test-set accuracy of the full-vocabulary model, scored from the predictions
# already stored in output_preds instead of calling predict() again.
sentiment_model_accuracy = accuracy_score(test_data['sentiment'], output_preds)
print sentiment_model_accuracy

0.932295416367


In [175]:
simple_test_preds = simple_sentiment_model.predict(test_matrix_word_subset)
print accuracy_score(test_data['sentiment'], simple_test_preds)

0.869360451164


In [176]:
simple_train_preds = simple_sentiment_model.predict(train_matrix_word_subset)
print accuracy_score(train_data['sentiment'], simple_train_preds)

0.866822570007


In [179]:
# Class balance of the training labels: number of +1 and -1 reviews
num_positive  = (train_data['sentiment'] == +1).sum()
num_negative = (train_data['sentiment'] == -1).sum()
# Print each count with its share of the training set; the larger share is
# the training accuracy of a classifier that always predicts that class.
print num_positive, float(num_positive)/float(len(train_data))
print num_negative, float(num_negative)/float(len(train_data))

112164 0.840708760568
21252 0.159291239432
