# Predicting sentiment from product reviews

# Fire up Libraries

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics

from sklearn.linear_model import LogisticRegression

# Read some product review data

## Loading reviews for a set of baby products. 

In [None]:
# Load the Amazon baby-product reviews; later cells read the 'name',
# 'review' and 'rating' columns of this frame.
products = pd.read_csv('amazon_baby.csv')

# Let's explore this data together

## Data includes the product name, the review text and the rating of the review. 

## Some rows have missing values, so we drop them

In [None]:
# Show (rows, columns) of the raw data before any cleaning.
products.shape

In [None]:
# Peek at the first rows of the data.
products.head()

In [None]:
# Count the missing values in each column.
products.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column.
products = products.dropna(axis='index', how='any')

In [None]:
# Re-check the per-column missing counts now that incomplete rows are dropped.
products.isnull().sum()

In [None]:
# Shape after cleaning, for comparison with the shape shown before the dropna.
products.shape # we lost a few examples

# Examining the reviews for the most-reviewed product: 'Vulli Sophie the Giraffe Teether'

In [None]:
# Tally how many reviews each product name has.
c = Counter(products['name'].tolist())

In [None]:
# Show the ten product names with the most reviews, counts right-aligned.
# print() is called with a single argument so the cell produces the same
# output on Python 2 and Python 3.
print('Most common:')
for name, count in c.most_common(10):
    print('%s: %7d' % (name, count))

# What is this Sophie the Giraffe?

# <img src="sophie.jpg">

In [None]:
# Keep only the reviews whose product name is the Sophie teether.
giraffe_reviews = products.loc[products['name'].eq('Vulli Sophie the Giraffe Teether')]

In [None]:
# Number of reviews for the giraffe teether.
len(giraffe_reviews)

In [None]:
# Histogram of the ratings given to the giraffe teether.
# Bin edges sit at -0.5, 0.5, ..., 8.5 so each integer value is centred in
# its own bar.
plt.hist(giraffe_reviews['rating'], bins=np.arange(10) - 0.5)
plt.xlabel("Value")
plt.xlim([-1, 6])
plt.ylabel("Frequency")
# Pass a real boolean: matplotlib deprecated the 'on'/'off' strings here.
plt.grid(True)
plt.show()

# Build a sentiment classifier

In [None]:
# Histogram of the ratings across all products in `products`.
# Bin edges sit at -0.5, 0.5, ..., 8.5 so each integer value is centred in
# its own bar.
plt.hist(products['rating'], bins=np.arange(10) - 0.5)
plt.xlabel("Value")
plt.xlim([-1, 6])
plt.ylabel("Frequency")
# Pass a real boolean: matplotlib deprecated the 'on'/'off' strings here.
plt.grid(True)
plt.show()

## Define what's a positive and a negative sentiment

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.  Reviews with a rating of 4 or higher will be considered positive, while the ones with rating of 2 or lower will have a negative sentiment.   

In [None]:
# Drop the 3-star reviews (treated as neutral) and keep everything else.
products = products.loc[products['rating'].ne(3)]

In [None]:
# Label each review: True (positive) when the rating is 4 or higher,
# False otherwise.
# assign() returns a new frame, so the column is never written into a
# filtered slice of the original data (which raises SettingWithCopyWarning).
products = products.assign(sentiment=products['rating'] >= 4)

In [None]:
# Check that the new 'sentiment' column is present.
products.head()

## Let's train the sentiment classifier

In [None]:
# Randomly take 80% of the reviews for training (fixed seed) and keep the
# rows whose index labels were not sampled as the test set.
train_data = products.sample(frac=0.8, random_state=200)
in_train = products.index.isin(train_data.index)
test_data = products[~in_train]
print(train_data.shape, test_data.shape)

# Build the word count vector for each review

In [None]:
# Learn the vocabulary from the training reviews and encode each review as
# a vector of word counts.
train_reviews = train_data['review'].values
count_vectorizer = CountVectorizer()
x_train = count_vectorizer.fit_transform(train_reviews)

# Build the y vector

In [None]:
# Training targets: the boolean sentiment label of each training review.
y_train = train_data['sentiment'].values

# Training the classifier

In [None]:
# Fit a logistic-regression classifier, constructed with no arguments, that
# predicts the sentiment label from the word counts.
classifier = LogisticRegression()
classifier.fit(x_train, y_train)

# Evaluate the sentiment model

In [None]:
# Encode the held-out reviews with the vocabulary already learned on the
# training set (transform only, no refit).
test_reviews = test_data['review'].values
x_test = count_vectorizer.transform(test_reviews)
y_test = test_data['sentiment'].values

# Hard class predictions for each test review.
predicted = classifier.predict(x_test)

In [None]:
# Display the predicted labels for the test set.
predicted

In [None]:
# Class probabilities for every test review, one column per class
# (column 1 is the one the ROC AUC evaluation reads).
probs = classifier.predict_proba(x_test)
# Single-argument print() gives the same output on Python 2 and Python 3.
print(probs)

In [None]:
# Evaluation on the held-out set: accuracy of the hard predictions, then
# ROC AUC computed from column 1 of `probs`.
# Single-argument print() gives the same output on Python 2 and Python 3.
print(metrics.accuracy_score(y_test, predicted))
print(metrics.roc_auc_score(y_test, probs[:, 1]))

# Applying the learned model to understand sentiment for Giraffe

In [None]:
# Score every giraffe review with the trained model, reusing the fitted
# vocabulary to build the word-count vectors.
giraffe_counts = count_vectorizer.transform(giraffe_reviews['review'])
sophie_probs = classifier.predict_proba(giraffe_counts)

In [None]:
# Shape of the probability array returned for the giraffe reviews.
sophie_probs.shape

In [None]:
# Store the probability of the positive class as the predicted sentiment.
# Column 1 is the same column the ROC AUC evaluation reads from `probs`;
# scikit-learn orders the columns by classifier.classes_, which puts True
# after False for boolean labels.
# Taking max(axis=1) instead would store the probability of whichever class
# won, so a confidently negative review would score as high as a confidently
# positive one and sorting by this column could not separate them.
# assign() returns a new frame rather than writing into a filtered slice.
giraffe_reviews = giraffe_reviews.assign(predicted_sentiment=sophie_probs[:, 1])

In [None]:
# Check that the 'predicted_sentiment' column was added.
giraffe_reviews.head()

## Sort the reviews based on the predicted sentiment and explore

In [None]:
# Order the reviews from highest to lowest predicted sentiment.
giraffe_reviews = giraffe_reviews.sort_values(by='predicted_sentiment', ascending=False)

## Most positive reviews for the giraffe

In [None]:
# First rows after the descending sort, i.e. the highest predicted sentiment.
giraffe_reviews.head()

In [None]:
# Print the full text of the review with index label 34892.
# Single-argument print() gives the same output on Python 2 and Python 3.
# NOTE(review): the label is hard-coded, presumably copied from a head()
# listing; an empty array is printed if no row carries it - confirm after re-running.
print(giraffe_reviews[giraffe_reviews.index==34892]['review'].values)

In [None]:
# Print the full text of the review with index label 34434.
# Single-argument print() gives the same output on Python 2 and Python 3.
# NOTE(review): the label is hard-coded, presumably copied from a head()
# listing; an empty array is printed if no row carries it - confirm after re-running.
print(giraffe_reviews[giraffe_reviews.index==34434]['review'].values)

## Show most negative reviews for giraffe

In [None]:
# Last rows after the descending sort, i.e. the lowest predicted sentiment.
giraffe_reviews.tail()

In [None]:
# Print the full text of the review with index label 34576.
# Single-argument print() gives the same output on Python 2 and Python 3.
# NOTE(review): the label is hard-coded, presumably copied from a tail()
# listing; an empty array is printed if no row carries it - confirm after re-running.
print(giraffe_reviews[giraffe_reviews.index==34576]['review'].values)

In [None]:
# Print the full text of the review with index label 35072.
# Single-argument print() gives the same output on Python 2 and Python 3.
# NOTE(review): the label is hard-coded, presumably copied from a tail()
# listing; an empty array is printed if no row carries it - confirm after re-running.
print(giraffe_reviews[giraffe_reviews.index==35072]['review'].values)

# Quiz questions

### Answer 1

### Answer 2

### Answer 3

### Answer 4