# Predicting sentiment from product reviews

# Fire up GraphLab Create

In [None]:
import graphlab

# Read some product review data

Loading reviews for a set of baby products. 

In [None]:
# Load the baby-product reviews into an SFrame from the path 'amazon_baby.gl/'.
products = graphlab.SFrame('amazon_baby.gl/')

# Let's explore this data together

The data includes the product name, the review text, and the rating for each review.

In [None]:
# Preview the first rows of the reviews table.
products.head()

# Build the word count vector for each review

In [None]:
# Add a 'word_count' column by running GraphLab's count_words over the
# 'review' text column. Later cells look words up in each entry with
# `word in entry` / `entry[word]`.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])

In [None]:
# Preview the table again, now including the 'word_count' column.
products.head()

In [None]:
# Point GraphLab Canvas at the 'ipynb' target so that the .show() calls
# below render inside the notebook.
graphlab.canvas.set_target('ipynb')

In [None]:
# Visualize the 'name' (product name) column.
products['name'].show()

# Examining the reviews for the most-sold product: 'Vulli Sophie the Giraffe Teether'

In [None]:
# Keep only the rows whose product name is exactly
# 'Vulli Sophie the Giraffe Teether'.
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']

In [None]:
# Number of reviews for this product.
len(giraffe_reviews)

In [None]:
# Show the ratings of the giraffe reviews using the 'Categorical' view.
giraffe_reviews['rating'].show(view='Categorical')

# Build a sentiment classifier

In [None]:
# Show the ratings across all products using the 'Categorical' view.
products['rating'].show(view='Categorical')

## Define what's a positive and a negative sentiment

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment. Reviews with a rating of 4 or higher will be considered positive, while those with a rating of 2 or lower will be considered negative.

In [None]:
# Ignore all 3-star reviews: they are treated as neutral and left out of the
# sentiment data.
products = products[products['rating'] != 3]

In [None]:
# Positive sentiment = 4- or 5-star reviews. The comparison result is stored
# in a new 'sentiment' column, which is later used as the classifier target.
products['sentiment'] = products['rating'] >=4

In [None]:
# Preview the table with the new 'sentiment' column.
products.head()

## Let's train the sentiment classifier

In [None]:
# Randomly split the reviews into training and test sets (fraction .8);
# the fixed seed makes the split repeatable.
train_data,test_data = products.random_split(.8, seed=0)

In [None]:
# Train a logistic classifier that predicts 'sentiment' from the
# 'word_count' column alone.
# NOTE(review): test_data is also passed as the validation set, so it is not
# a fully held-out set — confirm this is intended.
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['word_count'],
                                                     validation_set=test_data)

# Evaluate the sentiment model

In [None]:
# Evaluate on the test set, requesting only the 'roc_curve' metric.
sentiment_model.evaluate(test_data, metric='roc_curve')

In [None]:
# Open the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')

# Applying the learned model to understand sentiment for Giraffe

In [None]:
# Score every giraffe review with the trained model. output_type='probability'
# asks for a probability rather than a class label.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(giraffe_reviews, output_type='probability')

In [None]:
# Preview the giraffe reviews with the new 'predicted_sentiment' column.
giraffe_reviews.head()

## Sort the reviews based on the predicted sentiment and explore

In [None]:
# Order the reviews from highest to lowest predicted sentiment.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)

In [None]:
# Preview the sorted reviews; the highest-scored ones now come first.
giraffe_reviews.head()

## Most positive reviews for the giraffe

In [None]:
# Text of the review with the highest predicted sentiment (first row after
# the descending sort).
giraffe_reviews[0]['review']

In [None]:
# Text of the review with the second-highest predicted sentiment.
giraffe_reviews[1]['review']

## Show most negative reviews for giraffe

In [None]:
# Text of the review with the lowest predicted sentiment (last row after the
# descending sort).
giraffe_reviews[-1]['review']

In [None]:
# Text of the review with the second-lowest predicted sentiment.
giraffe_reviews[-2]['review']

In [None]:
# Hand-picked words; each one becomes its own count column and a feature of
# the selected-words model trained further down.
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

In [None]:
def awesome_count(dict):
    """Return the count stored under 'awesome' in a word-count mapping.

    `dict` maps words to their counts (the parameter name shadows the
    builtin, but it is kept so existing callers are unaffected).
    Returns 0 when 'awesome' is not one of the keys.
    """
    return dict['awesome'] if 'awesome' in dict else 0

In [None]:
# Add an 'awesome' column: the result of awesome_count applied to each
# entry of 'word_count'.
products['awesome'] = products['word_count'].apply(awesome_count)

In [None]:
from functools import partial

def word_count(word, dict):
    """Return the count recorded for `word` in a word-count mapping.

    `word` is the key to look up and `dict` maps words to their counts
    (the parameter name shadows the builtin, but it is kept so existing
    callers are unaffected). Returns 0 when `word` is not one of the keys.
    """
    return dict[word] if word in dict else 0

In [None]:
# Add one count column per selected word. partial() fixes the word as the
# first argument of word_count, so apply() only has to supply each
# 'word_count' entry.
for word in selected_words:
    products[word] = products['word_count'].apply(partial(word_count, word))

In [None]:
# Display the table with the per-word count columns added.
products

In [None]:
# Sum each selected word's column over all reviews, print the total, and
# collect (word, total) pairs in `count` for the sorting cell that follows.
count = []

for word in selected_words:
    x = products[word].sum()
    # Parenthesised single-argument print: the output is the same as the
    # Python 2 print statement, and the line is also valid Python 3.
    print("For word %s = " % word + str(x))
    count.append((word, x))

In [None]:
from operator import itemgetter
# Order the (word, total) pairs by total, largest first.
sorted(count, key=itemgetter(1), reverse = True)

In [None]:
# Split again with the same fraction and seed so that train_data and
# test_data include the per-word columns added after the first split.
train_data,test_data = products.random_split(.8, seed=0)

In [None]:
# Train a second logistic classifier for 'sentiment' that uses only the
# per-word count columns named in selected_words as features.
# NOTE(review): test_data is also passed as the validation set, so it is not
# a fully held-out set — confirm this is intended.
selected_words_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=selected_words,
                                                     validation_set=test_data)

In [None]:
# Print the learned coefficients sorted by 'value'.
# NOTE(review): 12 rows is presumably the 11 selected words plus an
# intercept — verify against the printed table.
selected_words_model['coefficients'].sort('value').print_rows(num_rows=12)

In [None]:
# Evaluate the selected-words model on the test set.
selected_words_model.evaluate(test_data)

In [None]:
# Evaluate the full word-count model on the same test set for comparison.
sentiment_model.evaluate(test_data)

In [None]:
# Keep only the rows whose product name is exactly 'Baby Trend Diaper Champ'.
diaper_champ_reviews = products[products['name'] == 'Baby Trend Diaper Champ']

In [None]:
# Store the word-count model's predicted probability for each review in a
# 'sentiment_model' column.
diaper_champ_reviews['sentiment_model'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')

In [None]:
# Order the reviews from highest to lowest 'sentiment_model' score.
diaper_champ_reviews = diaper_champ_reviews.sort('sentiment_model', ascending=False)

In [None]:
# Store the selected-words model's predicted probability for each review in
# a 'selected_words_model' column.
diaper_champ_reviews['selected_words_model'] = selected_words_model.predict(diaper_champ_reviews, output_type='probability')

In [None]:
# Show the review text and both models' scores for the first row. The frame
# was sorted in descending order on 'sentiment_model', so this is the review
# the word-count model scored highest.
diaper_champ_reviews['review', 'sentiment_model', 'selected_words_model'].head(1)

In [None]:
# Show every column of that same first row.
diaper_champ_reviews.head(1)