# Predicting sentiment from product reviews

# Fire up GraphLab Create

In [1]:
import graphlab

# Read some product review data

Loading reviews for a set of baby products. 

In [2]:
# Load the baby-product reviews into an SFrame from a local path.
# NOTE(review): this path is specific to one machine; point it at your own
# copy of amazon_baby.gl before running the notebook.
products = graphlab.SFrame('~/Personal/Workshop Machine learning/DataScienceWorkshops/ipythonNotebooks/amazon_baby.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1487157658.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1487157658.log


This non-commercial license of GraphLab Create for academic use is assigned to ahsan.ijaz@ebryx.com and will expire on December 13, 2017.


# Let's explore this data together

Data includes the product name, the review text and the rating of the review. 

In [34]:
# Preview the first rows of the review data.
products.head()

# Build the word count vector for each review

In [3]:
# Hand-picked sentiment words; their per-review counts become feature columns later.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate'
]

# Count how often every word occurs in each review and keep the result as a new column.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])

# Display the freshly built word-count column.
products['word_count']

dtype: dict
Rows: 183531
[{'and': 5, '6': 1, 'stink': 1, 'because': 1, 'ordered': 1, 'just': 1, 'boyfor': 1, 'wipes-ocean': 1, 'wipes,': 1, 'replace': 1, 'not': 1, 'softer': 1, 'are': 3, 'have': 2, 'in': 1, 'need': 1, 'rough': 1, 'ok,': 1, 'issues': 1, 'seemed': 1, 'use': 1, 'blue-12': 1, 'vimse': 1, 'for': 2, 'no': 1, 'that': 1, 'larger,': 1, 'been': 1, 'to': 2, 'someimse': 1, 'quality.': 1, '8': 1, 'flannel': 1, 'worth': 1, 'higher': 1, 'them': 1, 'get': 1, 'keeping.': 1, 'countwhich': 1, 'texture': 1, 'but': 1, 'cloth': 2, 'nicer,': 1, 'they': 1, 'hands': 1, 'fab': 1, 'now': 1, 'had': 2, 'a': 2, 'also': 1, 'about': 1, 'usingthirsties': 1, 'longer': 1, 'i': 2, 'my': 1, 'months': 1, 'wipes': 2, 'these': 1, 'while': 1, 'stripping': 1, 'faces': 1, 'handles.': 1, 'opinion': 1, 'starting': 1, 'pack': 1}, {'and': 3, 'love': 1, 'it': 2, 'highly': 1, 'osocozy': 1, 'bags': 1, 'holder.': 1, 'moist': 1, 'does': 1, 'recommend': 1, 'was': 1, 'wipes': 1, 'it.': 1, 'early': 1, 'disappointed.': 1, '

In [6]:
from functools import partial
def word_countA(word,dict):
    """Return the count stored for `word` in the mapping `dict`.

    `dict` maps words to counts. A word that is not a key of the mapping
    yields 0.
    """
    return dict[word] if word in dict else 0


In [7]:
# Add one column per selected word holding that word's count in each review.
# partial() fixes the word argument so apply() only has to supply each row's
# word-count dict.
for word in selected_words:
    count_this_word = partial(word_countA, word)
    products[word] = products['word_count'].apply(count_this_word)



In [12]:
# Report the total number of occurrences of each selected word over all reviews.
# The line is formatted into a single string first, so the cell prints the same
# "word= total" text under the Python 2 print statement and the Python 3 print
# function (the original `print a, b` form is a syntax error on Python 3).
for word in selected_words:
    print("%s= %s" % (word, products[word].sum()))


In [8]:
# Send GraphLab Canvas visualizations to the notebook ('ipynb' target).
graphlab.canvas.set_target('ipynb')

In [9]:
# Visualize the product-name column.
products['name'].show()

# Examining the reviews for the most-sold product: 'Vulli Sophie the Giraffe Teether'

In [10]:
# Keep only the rows whose product name is exactly the giraffe teether.
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']

In [11]:
# Keep only the rows whose product name is exactly the Baby Trend Diaper Champ.
baby_reviews = products[products['name'] == 'Baby Trend Diaper Champ']


In [12]:
# Number of reviews for the giraffe teether.
len(giraffe_reviews)

In [13]:
# Show the star ratings of the giraffe reviews using the categorical view.
giraffe_reviews['rating'].show(view='Categorical')

# Build a sentiment classifier

In [14]:
# Show the star ratings of all reviews using the categorical view.
products['rating'].show(view='Categorical')

## Define what's a positive and a negative sentiment

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.  Reviews with a rating of 4 or higher will be considered positive, while the ones with rating of 2 or lower will have a negative sentiment.   

In [15]:
# Drop every 3-star review so only clearly positive or negative ratings remain.
# giraffe_reviews and baby_reviews were taken from `products` before this
# filter, so they are not affected by it.
products = products[products['rating'] != 3]

In [16]:
# Label each remaining review: the comparison is true for ratings of 4 or 5
# (positive) and false for ratings of 1 or 2 (negative).
products['sentiment'] = products['rating'] >=4

In [17]:
# Preview the data again, now including the 'sentiment' column.
products.head()

## Let's train the sentiment classifier

In [18]:
# Randomly split the reviews with a 0.8 fraction going to train_data and the
# rest to test_data; the fixed seed keeps the split repeatable.
train_data,test_data = products.random_split(.8, seed=0)
# Feature columns for the selected-words model: one count column per selected word.
features=selected_words

In [19]:
# Classifier trained on the full per-review word-count dictionary.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data
)

# Classifier trained only on the columns named in `features`.
selected_words_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=features,
    validation_set=test_data
)

# Evaluate the sentiment model

In [20]:
# Evaluate the word-count model on the test split, requesting the ROC curve metric.
sentiment_model.evaluate(test_data, metric='roc_curve')


In [21]:
# Evaluate the selected-words model on the test split, requesting the ROC curve metric.
selected_words_model.evaluate(test_data,metric='roc_curve')


In [22]:
# Open the evaluation view for the selected-words model.
selected_words_model.show(view='Evaluation')

In [23]:
# Open the evaluation view for the word-count model.
sentiment_model.show(view='Evaluation')

In [24]:
# List the selected-words model's coefficients, largest value first.
selected_words_model["coefficients"].sort('value',ascending=False)


# Applying the learned model to understand sentiment for Giraffe

In [26]:
# Score every giraffe review with the word-count model and store the result as
# a new column; output_type='probability' asks for a probability rather than a
# class label.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(giraffe_reviews, output_type='probability')
# Show the Python type of giraffe_reviews.
type(giraffe_reviews)

In [27]:
# Score every Diaper Champ review with the word-count model and store the
# result as a new column; output_type='probability' asks for a probability
# rather than a class label.
baby_reviews['predicted_sentiment'] = sentiment_model.predict(baby_reviews, output_type='probability')
# Show the Python type of baby_reviews.
type(baby_reviews)


In [31]:
# Preview the Diaper Champ reviews together with their predicted sentiment.
baby_reviews.head()

## Sort the reviews based on the predicted sentiment and explore

In [28]:
# Reorder the Diaper Champ reviews from highest to lowest predicted sentiment.
baby_reviews = baby_reviews.sort('predicted_sentiment', ascending=False)

In [29]:
# Ask the selected-words model for its probability on the first row of
# baby_reviews only (the slice keeps it a one-row frame).
selected_words_model.predict(baby_reviews[0:1], output_type='probability')

In [34]:
# Preview the first rows of baby_reviews.
baby_reviews.head()

## Most positive review for the Baby Trend Diaper Champ

In [37]:
# Only the last expression of a cell is displayed, so this cell shows
# selected_words; the review text evaluated on the first line is not displayed.
baby_reviews[0]['review']
selected_words

In [38]:
# Text of the first row of baby_reviews (the highest predicted sentiment once
# the frame has been sorted in descending order).
baby_reviews[0]['review']

## Show most negative reviews for giraffe

In [30]:
# giraffe_reviews is never ordered by predicted sentiment elsewhere in this
# notebook, so indexing it directly would just return its last row. Sort it
# from highest to lowest predicted sentiment first; the last row is then the
# most negative review.
giraffe_reviews.sort('predicted_sentiment', ascending=False)[-1]['review']

In [31]:
# giraffe_reviews is never ordered by predicted sentiment elsewhere in this
# notebook, so sort it from highest to lowest predicted sentiment before
# indexing; the second-to-last row is then the second most negative review.
giraffe_reviews.sort('predicted_sentiment', ascending=False)[-2]['review']