# Notebook Setup

In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.externals import joblib
import math

In [2]:
# Load the review data set from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that directory exists — consider a configurable data directory.
products = pd.read_csv('/Users/Andy/Documents/Coursera/Machine Learning Specialization - Classification /amazon_baby.csv')

In [3]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


# Cleaning

In [4]:
import string
# Demonstration: delete every punctuation character from a sample string.
s = "string. With. Punctuation?" # Sample string 
# Mapping each punctuation code point to None makes translate() drop it.
translator = dict.fromkeys(map(ord, string.punctuation))
s.translate(translator)

'string With Punctuation'

In [5]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, because the function is applied
# to each review in the data set.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Parameters
    ----------
    text : str
        The text to clean. Characters outside ``string.punctuation``
        (letters, digits, whitespace, ...) are left untouched.

    Returns
    -------
    str
        A copy of ``text`` without punctuation; an empty string stays empty.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [6]:
# Replace missing reviews with empty strings so the cleaning step can treat every entry as text.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# pulled out of the frame is a chained in-place update, which recent pandas versions
# warn about and which does not modify `products` under Copy-on-Write.
products['review'] = products['review'].fillna('')

In [7]:
# Punctuation-free copy of every review, stored alongside the original text.
products['review_clean'] = products['review'].map(remove_punctuation)

In [8]:
products.iloc[0]['review_clean']

'These flannel wipes are OK but in my opinion not worth keeping  I also ordered someImse Vimse Cloth WipesOcean Blue12 countwhich are larger had a nicer softer texture and just seemed higher quality  I use cloth wipes for hands and faces and have been usingThirsties 6 Pack Fab Wipes Boyfor about 8 months now and need to replace them because they are starting to get rough and have had stink issues for a while that stripping no longer handles'

In [9]:
# Drop the reviews rated exactly 3.
# .copy() makes the result an independent frame, so adding columns to it afterwards
# does not raise SettingWithCopyWarning.
products = products[products['rating'] != 3].copy()

In [10]:
# Label each review: rating above 3 -> +1, anything else -> -1.
products['sentiment'] = products['rating'].gt(3).map({True: 1, False: -1})

# Test/Train Split - <code>train_data</code> and <code>test_data</code> 

In [11]:
# Read the row indices of the test split as a list of strings.
# The file is presumably a single bracketed, comma-separated list ("[8, 9, 14, ...]") — confirm.
# The context manager closes the file handle once it has been read.
with open('/Users/Andy/Documents/Coursera/Machine Learning Specialization - Classification /module-2-assignment-test-idx.json','r') as test_idx_file:
    test_idx_text = test_idx_file.read()
# Strip the enclosing brackets instead of overwriting the first and last entries with
# hard-coded values, so the cell stays correct if the index file changes.
# Entries may keep leading whitespace; it is removed when they are converted to int.
test_indic = test_idx_text.strip().strip('[]').split(',')

In [12]:
test_indic[0:10]

[' 8', ' 9', ' 14', ' 18', ' 24', ' 31', ' 32', ' 36', ' 38', ' 50']

In [13]:
# Read the row indices of the training split as a list of strings.
# The file is presumably a single bracketed, comma-separated list ("[0, 1, 2, ...]") — confirm.
# The context manager closes the file handle once it has been read.
with open('/Users/Andy/Documents/Coursera/Machine Learning Specialization - Classification /module-2-assignment-train-idx.json') as train_idx_file:
    train_idx_text = train_idx_file.read()
# Strip the enclosing brackets instead of overwriting the first and last entries with
# hard-coded values, so the cell stays correct if the index file changes.
# Entries may keep leading whitespace; it is removed when they are converted to int.
train_indic = train_idx_text.strip().strip('[]').split(',')

In [14]:
train_indic[0:10]

[' 0', ' 1', ' 2', ' 3', ' 4', ' 5', ' 6', ' 7', ' 10', ' 11']

In [15]:
# Turn each textual index entry into an int (surrounding whitespace is stripped first).
test_list = [
    [int(piece.strip()) for piece in entry.split(',')][0]
    for entry in test_indic
]

In [16]:
test_list[0:10]

[8, 9, 14, 18, 24, 31, 32, 36, 38, 50]

In [17]:
# Pick the test rows by integer position (iloc), not by index label,
# then preview the first few of them.
test_data = products.iloc[test_list]
test_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1


In [18]:
# Turn each textual index entry into an int (surrounding whitespace is stripped first).
train_list = [
    [int(piece.strip()) for piece in entry.split(',')][0]
    for entry in train_indic
]

In [19]:
# Pick the training rows by integer position (iloc), not by index label,
# then preview the first ten of them.
train_data = products.iloc[train_list]
train_data.head(10)

Unnamed: 0,name,review,rating,review_clean,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1
6,A Tale of Baby's Days with Peter Rabbit,"Lovely book, it's bound tightly so you may not...",4,Lovely book its bound tightly so you may not b...,1
7,"Baby Tracker&reg; - Daily Childcare Journal, S...",Perfect for new parents. We were able to keep ...,5,Perfect for new parents We were able to keep t...,1
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5,A friend of mine pinned this product on Pinter...,1
11,"Baby Tracker&reg; - Daily Childcare Journal, S...",This book is perfect! I'm a first time new mo...,5,This book is perfect Im a first time new mom ...,1
12,"Baby Tracker&reg; - Daily Childcare Journal, S...",I originally just gave the nanny a pad of pape...,4,I originally just gave the nanny a pad of pape...,1


In [20]:
test_data.sentiment.value_counts()

 1    28095
-1     5241
Name: sentiment, dtype: int64

In [21]:
train_data.sentiment.value_counts()

 1    112164
-1     21252
Name: sentiment, dtype: int64

# Build the word count vector for each review

In [22]:
# Word-count features for the cleaned reviews.
# The custom token pattern keeps single-letter words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# First, learn the vocabulary from the training data only (assigning a column to each word)
# and convert the training reviews into a sparse matrix.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test reviews into a sparse matrix using the same word-column mapping (no refit).
test_matrix = vectorizer.transform(test_data['review_clean'])

In [23]:
train_matrix.shape

(133416, 121712)

In [44]:
lr = LogisticRegression()

In [45]:
# Train the classifier on the full word-count matrix with the +1/-1 sentiment labels.
# NOTE(review): scikit-learn's fit() returns the estimator it was called on, so
# `sentiment_model` and `lr` refer to the same object — refitting `lr` later would
# also change `sentiment_model`.
sentiment_model = lr.fit(X=train_matrix,y=train_data['sentiment'])

In [46]:
sentiment_model.coef_.shape

(1, 121712)

In [47]:
# Tally the learned coefficients by sign; zeros are counted with the positives (>= 0).
pos = np.count_nonzero(sentiment_model.coef_ >= 0)
neg = np.count_nonzero(sentiment_model.coef_ < 0)
print('# of positive coefficients:',pos)
print('# of negative coefficients:',neg)

# of positive coefficients: 85811
# of negative coefficients: 35901


In [48]:
# Persist the fitted model to 'sentiment_model.pkl' so it can be reloaded without retraining.
# NOTE(review): `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; on recent versions use `import joblib` directly — confirm against the
# installed version.
# from sklearn.externals import joblib
joblib.dump(sentiment_model,'sentiment_model.pkl')
# To restore the saved model instead of refitting:
# sentiment_model=joblib.load('sentiment_model.pkl')

['sentiment_model.pkl']

# Making predictions with logistic regression

In [49]:
# Use the three test reviews at positions 10, 11 and 12 as a small worked example.
sample_test_data = test_data.iloc[10:13]
print(sample_test_data)

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  


In [50]:
sample_test_data.iloc[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [51]:
sample_test_data.iloc[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [52]:
sample_test_data.iloc[2]['review']

"Was so excited to get this product for my baby girls bedroom!  When I got it the back is NOT STICKY at all!  Every time I walked into the bedroom I was picking up pieces off of the floor!  Very very frustrating!  Ended up having to super glue it to the wall...very disappointing.  I wouldn't waste the time or money on it."

In [53]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])

In [54]:
# Raw decision scores for the three sample reviews (one score per row of the matrix).
# These are turned into class labels and probabilities in the cells below.
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  5.60351902  -3.13388392 -10.40529952]


## Predicting Sentiment

In [55]:
def score_function(score):
    """Convert a decision score into a class label.

    Parameters
    ----------
    score : float
        Decision score of a single example.

    Returns
    -------
    int
        ``1`` when ``score`` is strictly positive, ``-1`` otherwise.
        A score of exactly 0 and a NaN score both map to ``-1``.
    """
    # A plain if/else covers every input. The earlier `if score>0 / elif score<=0`
    # pair matched neither branch for NaN and failed with UnboundLocalError.
    if score > 0:
        return 1
    return -1

In [56]:
# Predicted class label (+1 / -1) for every sample score.
labels = list(map(score_function, scores))

In [57]:
labels

[1, -1, -1]

## Probability Predictions

In [58]:
def calculate_proba(score):
    proba = 1/(1+math.exp(-score))
    return proba

In [59]:
# Probability of the positive class for every sample score.
probas = list(map(calculate_proba, scores))

In [60]:
probas

[0.9963286547025019, 0.041731014173507615, 3.0270715756816633e-05]

In [61]:
print('The review with the lowers proba is:',np.argmin(probas)+1)

The review with the lowers proba is: 3


## Find the most positive (and negative) review

In [62]:
sentiment_model.predict_proba(sample_test_matrix)[:,1]

array([  9.96328655e-01,   4.17310142e-02,   3.02707158e-05])

In [63]:
test_matrix.shape

(33336, 121712)

In [64]:
testProbaPred = pd.Series(sentiment_model.predict_proba(test_matrix)[:,1])

In [65]:
# Attach the predicted probabilities to the test rows as a new `proba` column.
# `.values` passes the raw array, so the values are matched to rows by position
# rather than aligned on the Series' own index.
TestProbas = test_data.assign(proba=testProbaPred.values)

In [66]:
TestProbas.head()

Unnamed: 0,name,review,rating,review_clean,sentiment,proba
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,0.781078
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,0.999999
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,0.934123
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,0.999978
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1,0.98021


In [92]:
# Most positive reviews: the 20 test rows with the highest predicted probability.
TestProbas.sort_values(by='proba',ascending=False)[['name','proba']].head(20)

Unnamed: 0,name,proba
50315,"P'Kolino Silly Soft Seating in Tias, Green",1.0
180646,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,1.0
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,1.0
137034,Graco Pack 'n Play Element Playard - Flint,1.0
140816,"Diono RadianRXT Convertible Car Seat, Plum",1.0
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",1.0
87017,Baby Einstein Around The World Discovery Center,1.0
100166,"Infantino Wrap and Tie Baby Carrier, Black Blu...",1.0
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,1.0
133651,"Britax 2012 B-Agile Stroller, Red",1.0


In [93]:
# Most negative reviews: the 20 test rows with the lowest predicted probability.
TestProbas.sort_values(by='proba',ascending=True)[['name','proba']].head(20)

Unnamed: 0,name,proba
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,0.0
120209,Levana Safe N'See Digital Video Baby Monitor w...,0.0
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,0.0
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,0.0
155287,VTech Communications Safe &amp; Sounds Full Co...,0.0
94560,The First Years True Choice P400 Premium Digit...,0.0
53207,Safety 1st High-Def Digital Monitor,0.0
81332,Cloth Diaper Sprayer--styles may vary,0.0
113995,Motorola Digital Video Baby Monitor with Room ...,0.0
10677,Philips AVENT Newborn Starter Set,0.0


In [97]:
testPred = sentiment_model.predict(test_matrix)

In [98]:
testPredDF = test_data.assign(classPred=testPred)
testPredDF.head()

Unnamed: 0,name,review,rating,review_clean,sentiment,classPred
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,1
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,1
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,1
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1,1


In [99]:
print(testPredDF.sentiment.value_counts())
print(testPredDF.classPred.value_counts())

 1    28095
-1     5241
Name: sentiment, dtype: int64
 1    28743
-1     4593
Name: classPred, dtype: int64


In [101]:
# Flag every test row as 'correct' when the predicted class equals the true sentiment,
# otherwise 'incorrect'.
testPredDF.loc[:,'correctPred']= np.where(testPredDF['sentiment']==testPredDF['classPred'],'correct','incorrect')

In [104]:
testPredDF.head()

Unnamed: 0,name,review,rating,review_clean,sentiment,classPred,correctPred
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,1,correct
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,1,correct
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,1,correct
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,1,correct
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1,1,correct


In [112]:
# Accuracy = number of rows flagged 'correct' divided by the number of flagged rows.
accuracy = (testPredDF.correctPred == 'correct').sum() / testPredDF.correctPred.count()
print('the accuracy is : {0}%'.format(accuracy.round(5)*100))

the accuracy is : 93.227%


## Learn another classifier with fewer words


In [113]:
# Hand-picked vocabulary of 20 words used to train the smaller classifier below.
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [114]:
# Second vectorizer whose vocabulary is fixed to `significant_words`, so only those
# words are counted. No custom token pattern is passed here, unlike the full vectorizer.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
# Count the selected words in the training reviews...
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
# ...and in the test reviews, using the same columns.
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [115]:
# Fit the 20-word model on its own, fresh estimator.
# fit() returns the estimator it was called on, so calling lr.fit(...) again here
# would make `simple_model` the same object as `sentiment_model` and overwrite the
# full model's learned coefficients with the 20-word ones.
simple_model = LogisticRegression().fit(X=train_matrix_word_subset,y=train_data['sentiment'])

In [127]:
simple_model.coef_

array([[ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
         1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
        -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
        -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109]])

In [118]:
# Tally the simple model's coefficients by sign; zeros are counted with the positives (>= 0).
simplePos = np.count_nonzero(simple_model.coef_ >= 0)
simpleNeg = np.count_nonzero(simple_model.coef_ < 0)
print('# of positive coefficients:',simplePos)
print('# of negative coefficients:',simpleNeg)

# of positive coefficients: 10
# of negative coefficients: 10


In [121]:
# One row per selected word, paired with its coefficient in the simple model.
simple_model_coef_table = pd.DataFrame(
    dict(word=significant_words, coef=simple_model.coef_.flatten())
)

In [124]:
# Words whose coefficient in the simple model is strictly positive.
simple_model_coef_table.loc[simple_model_coef_table['coef']>0, 'word'].tolist()

['love',
 'great',
 'easy',
 'old',
 'little',
 'perfect',
 'loves',
 'well',
 'able',
 'car']

0
