In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the product reviews from a CSV file given by a relative path
# (resolved against the notebook's working directory).
products = pd.read_csv('amazon_baby.csv')

In [3]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters dropped are exactly those listed in
    ``string.punctuation``; all other characters are kept in their
    original order.

    The earlier ``text.translate(None, string.punctuation)`` form only
    works on Python 2 byte strings: it raises ``TypeError`` on Python 2
    ``unicode`` objects and on every Python 3 ``str``. Filtering character
    by character behaves the same on all of them.

    Parameters
    ----------
    text : str
        The string to clean. May be empty.

    Returns
    -------
    str
        ``text`` without punctuation characters.
    """
    import string

    # A frozenset gives O(1) membership tests for the per-character filter.
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

In [5]:
# Number of rows whose review text is missing: sum the boolean null mask
# instead of materialising the filtered frame.
int(products['review'].isnull().sum())

829

In [6]:
# Replace missing review text with an empty string; every other column is
# carried over unchanged.
products = products.assign(review=products['review'].fillna(''))

In [7]:
# Store a punctuation-free version of each review next to the original text.
products['review_clean'] = products['review'].map(remove_punctuation)

In [9]:
# Ignore neutral reviews (i.e. rating == 3).
# .copy() makes the filtered frame an independent object, so adding a column
# below does not raise pandas' SettingWithCopyWarning (which fires when a
# column is assigned on the result of a boolean-mask selection).
products = products[products['rating'] != 3].copy()
# Create the sentiment column: +1 for ratings above 3, -1 otherwise.
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1)

In [27]:
# Training split: read the JSON file into a single-column frame, give the
# column a readable name, and pull the values out as a plain Python list.
train_index = pd.read_json('module-2-assignment-train-idx.json')
train_index.columns = ['indexvalue']
id_train = train_index['indexvalue'].tolist()

# Test split: same steps, different file.
test_index = pd.read_json('module-2-assignment-test-idx.json')
test_index.columns = ['indexvalue']
id_test = test_index['indexvalue'].tolist()

In [29]:
len(id_train), len(id_test), len(id_train) + len(id_test), len(products)

(133416, 33336, 166752, 166752)

In [46]:
# Split the reviews into train and test sets. The id lists are used as
# integer row positions (.iloc), not as index labels.
train_data = products.iloc[id_train]
test_data = products.iloc[id_test]

In [48]:
len(train_data), len(test_data), train_data.shape, test_data.shape

(133416, 33336, (133416, 5), (33336, 5))

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the word-count vectorizer. This token pattern is used to keep
# single-letter words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data and assign columns to
# words, then convert the training data into a sparse matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same
# word-column mapping (transform only; the vocabulary is not re-learned).
test_matrix = vectorizer.transform(test_data['review_clean'])

In [50]:
train_matrix

<133416x121712 sparse matrix of type '<type 'numpy.int64'>'
	with 7326618 stored elements in Compressed Sparse Row format>

In [53]:
train_matrix.shape, train_data['sentiment'].shape

((133416, 121712), (133416,))

In [54]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier created with scikit-learn's default settings.
logistics = LogisticRegression()
# Fit on the training word-count matrix with the sentiment column as target.
# sentiment_model is whatever fit() returns (scikit-learn documents fit() as
# returning the fitted estimator itself).
sentiment_model = logistics.fit(train_matrix, train_data['sentiment'])

In [66]:
# (count of strictly positive coefficients, total number of coefficients)
np.sum(sentiment_model.coef_ > 0), sentiment_model.coef_.shape[1]

(86605, 121712)

In [70]:
# Take three rows of the test set (positions 10, 11 and 12) as a small sample.
# .iloc makes the positional slicing explicit.
sample_test_data = test_data.iloc[10:13]
# print(...) with a single argument behaves the same on Python 2 and 3; the
# bare `print x` statement is a SyntaxError on Python 3.
print(sample_test_data)

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  


In [75]:
sample_test_data.iloc[0].review

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [76]:
sample_test_data.iloc[1].review

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

 We will now make a class prediction for the sample_test_data. The sentiment_model should predict +1 if the sentiment is positive and -1 if the sentiment is negative. Recall from the lecture that the score (sometimes called margin) for the logistic regression model is defined as:

$$\text{score}_i = \mathbf{w}^T h(\mathbf{x}_i)$$

where $\mathbf{w}$ is the vector of model weights (coefficients) and $h(\mathbf{x}_i)$ represents the features for data point $i$. We will write some code to obtain the scores. For each row, the score (or margin) is a number in the range $(-\infty, \infty)$. Use a pre-built function in your tool to calculate the score of each data point in sample_test_data. In scikit-learn, you can call the decision_function() function.

In [77]:
# Convert the sample reviews to word counts with the already-fitted vectorizer.
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
# decision_function returns the raw score (margin) for each row.
scores = sentiment_model.decision_function(sample_test_matrix)
# Parenthesised print works on both Python 2 and Python 3.
print(scores)

[  5.60241961  -3.17404486 -10.42630137]


In [80]:
# Class labels predicted by the fitted model for the sample rows.
pred = sentiment_model.predict(sample_test_matrix)

In [81]:
pred

array([ 1, -1, -1])

In [85]:
# Sigmoid (logistic) function applied element-wise to the scores:
# P(y = +1 | x) = 1 / (1 + exp(-score)).
probabilities = 1.0 / (1.0 + np.exp(-scores))

In [86]:
probabilities

array([  9.96324631e-01,   4.01542288e-02,   2.96416227e-05])

In [88]:
# Print each probability with six decimal places. The parenthesised form of
# print is valid on both Python 2 and Python 3.
for p in probabilities:
    print("$%.6f" % p)

$0.996325
$0.040154
$0.000030
