In [27]:
import pandas as pd
import string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model

In [12]:
def remove_punctuation(text):
    table = str.maketrans({key: None for key in string.punctuation})
    return text.translate(table) 

In [3]:
products = pd.read_csv('amazon_baby.csv')

In [13]:
products['review_clean'] = products['review'].apply(remove_punctuation)
products = products.fillna({'review':''})

In [5]:
products = products[products['rating'] != 3]

In [14]:
products['review_clean']

1         it came early and was not disappointed i love ...
2         Very soft and comfortable and warmer than it l...
3         This is a product well worth the purchase  I h...
4         All of my kids have cried nonstop when I tried...
5         When the Binky Fairy came to our house we didn...
6         Lovely book its bound tightly so you may not b...
7         Perfect for new parents We were able to keep t...
8         A friend of mine pinned this product on Pinter...
9         This has been an easy way for my nanny to reco...
10        I love this journal and our nanny uses it ever...
11        This book is perfect  Im a first time new mom ...
12        I originally just gave the nanny a pad of pape...
14        Space for monthly photos info and a lot of use...
15        I bought this calender for myself for my secon...
16        I love this little calender you can keep track...
17        This was the only calender I could find for th...
18        I completed a calendar for my 

In [15]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [16]:
products['sentiment']

1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
14        1
15        1
16        1
17        1
18        1
19        1
20        1
21       -1
22        1
24        1
25        1
26        1
28        1
29        1
30        1
31        1
32        1
33        1
         ..
183499    1
183500    1
183501    1
183502    1
183503   -1
183504    1
183505    1
183506    1
183507    1
183508    1
183509    1
183510    1
183512    1
183513    1
183514    1
183515    1
183517    1
183518    1
183519    1
183520    1
183521    1
183522    1
183523    1
183524    1
183525    1
183526    1
183527    1
183528    1
183529    1
183530    1
Name: sentiment, Length: 166752, dtype: int64

In [23]:
train = pd.read_json('module-2-assignment-train-idx.json')
test = pd.read_json('module-2-assignment-test-idx.json')

In [24]:
train_data = products.iloc[train[0]]
test_data = products.iloc[test[0]]

In [19]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

In [25]:
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])

In [28]:
sentiment = linear_model.LogisticRegression()

In [29]:
sentiment.fit(train_matrix, train_data['sentiment'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
sample_test_data = test_data[10:13]
print(sample_test_data.columns)

Index(['name', 'review', 'rating', 'review_clean', 'sentiment'], dtype='object')


In [38]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])

In [39]:
sample_score = sentiment.decision_function(sample_test_matrix, output_type='margin')

In [40]:
sentiment.predict(sample_test_matrix)

array([ 1, -1, -1])

In [49]:
def label(score):
    label_list=[]
    for i in range(len(score)):
        if score[i]>0:
            label_list.append(1)
        else:
            label_list.append(-1)
    return label_list

In [48]:
label(sample_score)

[1, -1, -1]

In [50]:
from math import exp
def probability(score):
    prob_list=[]
    for i in range(len(score)):
        prob_list.append(1.0/(1.0+exp(-score[i])))
    return prob_list

In [51]:
probability(sample_score)

[0.9963228663429621, 0.04033583547269647, 2.9716413157159314e-05]

In [54]:
test_matrix=vectorizer.transform(test_data['review_clean'])

In [55]:
test_score = sentiment.decision_function(test_matrix)

In [56]:
problist = probability(test_score)

In [57]:
len(problist)

33336

In [58]:
test_data.loc[:,'probability']=problist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [59]:
test_data.loc[:,'probability']=problist

In [60]:
test_data.sort_values('probability',ascending=True)[0:20]

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,8.461185e-16
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,1.603303e-15
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,8.10908e-14
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,9.858325e-14
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,1.915246e-13
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,3.356468e-13
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,3.27906e-11
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,3.333717e-11
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,9.477737e-11
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,9.628912e-11


In [61]:
test_data.loc[:,'label']=label(test_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [62]:
test_data.loc[:,'accurate']=(test_data['label']==test_data['sentiment'])

In [63]:
np.sum(test_data.loc[:,'accurate'])/len(test_data)

0.9322954163666907

In [64]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [67]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [68]:
simple_sentiment=linear_model.LogisticRegression()
simple_sentiment.fit(train_matrix_word_subset,train_data['sentiment'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [69]:
simple_weights=pd.Series(simple_sentiment.coef_.tolist()[0],index=significant_words)
simple_pos=simple_weights[significant_words][simple_weights[significant_words]>0].index.tolist()

In [73]:
words_vec=vectorizer.vocabulary_.keys()
sentiment_weights=pd.Series(sentiment.coef_.tolist()[0],index=words_vec)

In [74]:
pos=sentiment_weights[significant_words][sentiment_weights[significant_words]>0].index.tolist()
print(simple_pos)
print(pos)
sentiment_weights[significant_words]

['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 'well', 'able', 'car']
['love', 'great', 'old', 'loves', 'well', 'able', 'car', 'less', 'even', 'waste', 'disappointed', 'work', 'product', 'money', 'would', 'return']


love            2.670837e-01
great           6.483661e-02
easy           -5.468800e-03
old             8.179701e-03
little         -3.146887e-01
perfect        -6.860067e-01
loves           1.043761e-02
well            6.160998e-07
able            2.129163e-01
car             5.519557e-02
broke          -7.191992e-01
less            4.175778e-02
even            7.929719e-02
waste           5.762655e-03
disappointed    2.854324e-03
work            1.079507e-05
product         2.334292e-02
money           5.781430e-04
would           2.199556e-01
return          1.987394e-01
dtype: float64

In [75]:
test_data.loc[:,'simple']=simple_sentiment.predict(test_matrix_word_subset)

In [76]:
test_data.loc[:,'simple_accurate']=(test_data['sentiment']==test_data['simple'])

In [77]:
np.sum(test_data.loc[:,'simple_accurate'])/len(test_data)

0.8693604511639069

In [78]:
train_data.loc[:,'pred']=sentiment.predict(train_matrix)
np.sum(train_data['pred']==train_data['sentiment'])

129213

In [79]:
np.sum(train_data['sentiment']==simple_sentiment.predict(train_matrix_word_subset))

115648

In [80]:
np.sum(test_data['sentiment']==1)/len(test_data)

0.8427825773938085