# Predicting sentiment from product reviews

# Fire up GraphLab Create
(See [Getting Started with SFrames](/notebooks/Week%201/Getting%20Started%20with%20SFrames.ipynb) for setup instructions)

In [None]:
import graphlab

In [None]:
# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

# Read some product review data

Loading reviews for a set of baby products. 

In [None]:
products = graphlab.SFrame('amazon_baby.gl/')

# Let's explore this data together

Data includes the product name, the review text and the rating of the review. 

In [None]:
products.head()

# Build the word count vector for each review

In [None]:
products['word_count'] = graphlab.text_analytics.count_words(products['review'])

In [None]:
products.head()

In [None]:
graphlab.canvas.set_target('ipynb')

In [None]:
products['name'].show()

# Examining the reviews for most-sold product:  'Vulli Sophie the Giraffe Teether'

In [None]:
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']

In [None]:
len(giraffe_reviews)

In [None]:
giraffe_reviews['rating'].show(view='Categorical')

# Build a sentiment classifier

In [None]:
products['rating'].show(view='Categorical')

## Define what's a positive and a negative sentiment

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.  Reviews with a rating of 4 or higher will be considered positive, while the ones with rating of 2 or lower will have a negative sentiment.   

In [None]:
# ignore all 3* reviews
products = products[products['rating'] != 3]

In [None]:
# positive sentiment = 4* or 5* reviews
products['sentiment'] = products['rating'] >=4

In [None]:
products.head()

## Let's train the sentiment classifier

In [None]:
train_data,test_data = products.random_split(.8, seed=0)

In [None]:
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['word_count'],
                                                     validation_set=test_data)

# Evaluate the sentiment model

In [None]:
sentiment_model.evaluate(test_data, metric='roc_curve')

In [None]:
sentiment_model.show(view='Evaluation')

# Applying the learned model to understand sentiment for Giraffe

In [None]:
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(giraffe_reviews, output_type='probability')

In [None]:
giraffe_reviews.head()

## Sort the reviews based on the predicted sentiment and explore

In [None]:
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)

In [None]:
giraffe_reviews.head()

## Most positive reviews for the giraffe

In [None]:
giraffe_reviews[0]['review']

In [None]:
giraffe_reviews[1]['review']

## Show most negative reviews for giraffe

In [None]:
giraffe_reviews[-1]['review']

In [None]:
giraffe_reviews[-2]['review']

# Use .apply() to build a new feature with the counts for each of the selected_words

In [132]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

for word in selected_words:
    products[word]= products['word_count'].apply(lambda counts:\
                                               counts.get(word,0))
products

name,review,rating,word_count,sentiment,awesome
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,"{'and': 3, 'love': 1, 'it': 2, 'highly': 1, ...",1,0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,"{'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ...",1,0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,"{'ingenious': 1, 'and': 3, 'love': 2, ...",1,0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,"{'and': 2, 'parents!!': 1, 'all': 2, 'puppet.': ...",1,0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,"{'and': 2, 'this': 2, 'her': 1, 'help': 2, ...",1,0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,"{'shop': 1, 'noble': 1, 'is': 1, 'it': 1, 'as': ...",1,0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,"{'and': 2, 'all': 1, 'right': 1, 'when': 1, ...",1,0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,"{'and': 1, 'help': 1, 'give': 1, 'is': 1, ' ...",1,0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,"{'journal.': 1, 'nanny': 1, 'standarad': 1, ...",1,0
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4.0,"{'all': 1, 'forget': 1, 'just': 1, 'food': 1, ...",1,0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0,0,0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,2,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,2,0,0,0,0,0,0


In [133]:
products['awesome'].sum()

2002

In [134]:
products['great'].sum()

42420

In [135]:
train_data,test_data = products.random_split(.8, seed=0)

In [136]:
selected_words_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=selected_words,
                                                     validation_set=test_data)

In [137]:
selected_words_model['coefficients']

name,index,class,value,stderr
(intercept),,1,1.36728315229,0.00861805467824
awesome,,1,1.05800888878,0.110865296265
great,,1,0.883937894898,0.0217379527921
fantastic,,1,0.891303090304,0.154532343591
amazing,,1,0.892802422508,0.127989503231
love,,1,1.39989834302,0.0287147460124
horrible,,1,-1.99651800559,0.0973584169028
bad,,1,-0.985827369929,0.0433603009142
terrible,,1,-2.09049998487,0.0967241912229
awful,,1,-1.76469955631,0.134679803365


In [138]:
selected_words_model.evaluate(test_data)

{'accuracy': 0.8431419649291376,
 'auc': 0.6648096413721418,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        0        |  234  |
 |      0       |        1        |  5094 |
 |      1       |        1        | 27846 |
 |      1       |        0        |  130  |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.914242563530107,
 'log_loss': 0.4054747110366022,
 'precision': 0.8453551912568306,
 'recall': 0.9953531598513011,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+-------+------+
 | threshold | fpr | tpr |   p   |  n   |
 +-----------+-----+-----+-------+------+
 |    0.0    | 1.0 | 1.0 | 27976 | 5328 |
 |   1e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   2e-05   | 

In [139]:
sentiment_model.evaluate(test_data)

{'accuracy': 0.916256305548883,
 'auc': 0.9446492867438502,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  1328 |
 |      0       |        0        |  4000 |
 |      1       |        1        | 26515 |
 |      1       |        0        |  1461 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9500349343413533,
 'log_loss': 0.261066984324222,
 'precision': 0.9523039902309378,
 'recall': 0.9477766657134686,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+-------+------+
 | threshold |      fpr       |      tpr       |   p   |  n   |
 +-----------+----------------+----------------+-------+------+
 |    0.0    |      1.0       |   

In [140]:
diaper_champ_reviews = products[products['name'] == 'Baby Trend Diaper Champ']

In [141]:
diaper_champ_reviews['predicted_sentiment1']=selected_words_model.predict(diaper_champ_reviews,output_type='probability')

In [143]:
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment1',ascending=False)

In [144]:
diaper_champ_reviews.head()

name,review,rating,word_count,sentiment,awesome
Baby Trend Diaper Champ,I LOVE LOVE LOVE this product! It is SO much ...,4.0,"{'rating': 1, 'contacted': 1, 'over': ...",1,0
Baby Trend Diaper Champ,I received my Diaper Champ at my baby shower ...,5.0,"{'bags.': 1, ""don't"": 1, 'son.': 1, 'of,': 1, ...",1,0
Baby Trend Diaper Champ,"Love it, love it, love it! This lives up to ...",5.0,"{'instead': 1, 'all': 1, 'already': 1, 'love': 3, ...",1,0
Baby Trend Diaper Champ,Works great - no smells. LOVE that it uses reg ...,5.0,"{'and': 2, 'bags.': 1, 'garbage': 1, 'wastef ...",1,0
Baby Trend Diaper Champ,I love this diaper pale and wouldn't dream of ...,5.0,"{'and': 3, 'love': 1, 'use.': 1, 'is': 2, ' ...",1,0
Baby Trend Diaper Champ,I've worked with kids more than half my life. ...,5.0,"{'and': 4, 'genies': 1, 'all': 1, 'because': 1, ...",1,0
Baby Trend Diaper Champ,I love this diaper pail. It keeps the diapers ...,4.0,"{'and': 1, 'old': 1, 'extra': 1, 'is': 1, ...",1,0
Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper ...",5.0,"{'just': 3, 'money': 1, 'still': 3, 'fine': 1, ...",1,0
Baby Trend Diaper Champ,Love the Diaper Champ. I had planned to get the ...,4.0,"{'reviews,': 1, 'infant': 1, 'bags.': 1, 'just' ...",1,0
Baby Trend Diaper Champ,We had 2 diaper Genie's both given to us as a ...,4.0,"{'hand.': 1, 'both': 1, '(required': 1, 'befo ...",1,0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,predicted_sentiment1
1,0,0,3,0,0,0,0,0,0,0.998423414594
0,0,0,3,0,0,0,0,0,0,0.996192539732
0,0,0,3,0,0,0,0,0,0,0.996192539732
2,0,0,1,0,0,0,0,0,0,0.989387539605
2,0,0,1,0,0,0,0,0,0,0.989387539605
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527


In [145]:
diaper_champ_reviews['predicted_sentiment2']=sentiment_model.predict(diaper_champ_reviews,output_type='probability')

In [147]:
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment2',ascending=False)

In [148]:
diaper_champ_reviews.head()

name,review,rating,word_count,sentiment,awesome
Baby Trend Diaper Champ,Baby Luke can turn a clean diaper to a dirty ...,5.0,"{'all': 1, 'less': 1, ""friend's"": 1, '(which': ...",1,0
Baby Trend Diaper Champ,I LOOOVE this diaper pail! Its the easies ...,5.0,"{'just': 1, 'over': 1, 'rweek': 1, 'sooo': 1, ...",1,0
Baby Trend Diaper Champ,We researched all of the different types of di ...,4.0,"{'all': 2, 'just': 4, ""don't"": 2, 'one,': 1, ...",1,0
Baby Trend Diaper Champ,My baby is now 8 months and the can has been ...,5.0,"{""don't"": 1, 'able': 2, 'over': 1, 'soon': 1, ...",1,0
Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper ...",5.0,"{'just': 3, 'money': 1, 'still': 3, 'fine': 1, ...",1,0
Baby Trend Diaper Champ,Diaper Champ or Diaper Genie? That was my ...,5.0,"{'son': 2, 'all': 1, 'bags.': 1, 'son,': 1, ...",1,0
Baby Trend Diaper Champ,Wow! This is fabulous. It was a toss-up between ...,5.0,"{'and': 4, 'this': 3, 'stink': 1, 'garbage' ...",1,0
Baby Trend Diaper Champ,I originally put this item on my baby registry ...,5.0,"{'lysol': 1, 'all': 2, 'bags.': 1, 'feedback': ...",1,0
Baby Trend Diaper Champ,Two girlfriends and two family members put me ...,5.0,"{'just': 1, '-': 3, 'both': 1, 'results': 1, ...",1,0
Baby Trend Diaper Champ,I am one of those super- critical shoppers who ...,5.0,"{'all': 1, 'humid': 1, 'just': 1, 'less': 1, ...",1,0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,predicted_sentiment1
0,0,0,0,0,0,0,0,0,0,0.796940851291
0,0,0,1,0,0,0,0,0,0,0.940876393428
0,0,0,0,0,1,0,0,0,0,0.5942241719
2,0,0,0,0,1,0,0,0,0,0.895606298305
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,0,0,0,0,0,0,0,0.796940851291
0,0,0,0,0,0,0,0,0,0,0.796940851291
0,0,0,0,0,0,0,0,0,0,0.796940851291
0,0,0,0,1,0,0,0,0,0,0.347684052736
0,0,0,1,0,0,0,0,0,0,0.940876393428

predicted_sentiment2
0.999999937267
0.999999917406
0.999999899509
0.999999836182
0.999999824745
0.999999759315
0.999999692111
0.999999642488
0.999999604504
0.999999486804
