# Amazon Product reviews sentiment analysis using sklearn.

In [1]:
# load the pandas libraries
import pandas as pd

In [2]:
# load data
products = pd.read_csv('amazon_baby.csv')

In [3]:
# view data
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

In [5]:
# function to remove punctuation marks from the reviews. - Data Cleaning step
def remove_punctuation(text):
    import string
    if type(text) is str: # needed because data frame will return the index of rows
        return text.translate(None, string.punctuation)
    else:
        return

In [6]:
# apply this function to reviews and add new column with the clean data
products['review_clean'] = products['review'].apply(remove_punctuation)

In [7]:
# Extract sentiments that are not equal to 3 as thy represent neutral sentiment
products = products[products['rating']!=3]

In [8]:
# Assign sentiments -> +1 or -1 based on the rating
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [9]:
# split data into train and test with 80/20 ratio
from sklearn.cross_validation import train_test_split
train_data, test_data = train_test_split(products, test_size=0.2)

In [10]:
# Generate bag-of-words features by counting occurances of each wors in a review
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [11]:
# Generate a logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Quiz question - How many weights are >= 0? - check the other one
(model.coef_ >= 0).sum()

85151

In [13]:
sample_test_data = test_data[13:16]
sample_test_data = sample_test_data.reset_index()
print sample_test_data

    index                                               name  \
0   39824  Graco Baby SnugGlider Infant Car Seat Swing Frame   
1  131632  The First Years MiSwivel Feeding Seat, Dot to Dot   
2   92029  BooginHead PaciGrip Pacifier Holder Green Dot ...   

                                              review  rating  \
0  My lil one is currently 12 days old and spends...       5   
1  I love this high chair and so does my 5 month ...       4   
2  Item is accurately described and works wonderf...       5   

                                        review_clean  sentiment  
0  My lil one is currently 12 days old and spends...          1  
1  I love this high chair and so does my 5 month ...          1  
2  Item is accurately described and works wonderf...          1  


In [14]:
sample_test_data.at[0, 'review']

"My lil one is currently 12 days old and spends most of her day in the seat attached to this swing. I have yet to change the batteries and I just used the cheapies from the dollar tree! I am greatly surprised! I love the fact that there is nothing above the swing to bump the baby's head on, and its easier to get the baby out."

In [15]:
sample_test_data.at[1, 'review']

"I love this high chair and so does my 5 month old son, but since the straps are white in color they get dirty easily. I have been using this chair for 2 months now and even though I diligently cleaned the straps after every use they are now a yellowish color. I have tried everything to make them that brand new white color again, but to no avail. But the chair still works great and I still love it, even it's yellow straps."

In [16]:
# Predict the sentiment for sample test data
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = model.decision_function(sample_test_matrix)
print scores
my_pred_senti = map(lambda score: 1 if score > 0 else -1, scores)
print my_pred_senti
print model.predict(test_matrix[13:16,:])      # compare with the predict function

[ 3.32793116  6.81335996  2.64182689]
[1, 1, 1]
[1 1 1]


In [17]:
print model.predict(test_matrix[13:16,:])

[1 1 1]


In [18]:
test_matrix

<33351x121659 sparse matrix of type '<type 'numpy.int64'>'
	with 1808342 stored elements in Compressed Sparse Row format>

In [23]:
# Predict probabilities
from numpy import exp
my_pred_senti_prob = map(lambda score: float(1)/(1 + exp(-score)), scores)
print my_pred_senti_prob
print model.predict_proba(test_matrix[13:16,:]) [:,1]     # compare with the predict function

[0.96537468246681579, 0.99890221235170973, 0.93350545511487726]
[ 0.96537468  0.99890221  0.93350546]


**Note:** Need to load the pre-split train and test data to answer questions for quiz

In [20]:
train_data_idx = pd.read_json('module-2-assignment-train-idx.json', typ='series')
test_data_idx = pd.read_json('module-2-assignment-test-idx.json', typ='series')
# ^ in the lines above, very important to read in as Series.
train_data = products.iloc[train_data_idx]
test_data = products.iloc[test_data_idx]

In [24]:
# Generate bag-of-words features by counting occurances of each wors in a review
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [25]:
# Generate a logistic regression model
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Quiz question - How many weights are >= 0? - this one is correct
(sentiment_model.coef_ >= 0).sum()

85948

In [27]:
sample_test_data = test_data[10:13]
sample_test_data = sample_test_data.reset_index()
print sample_test_data

   index                                               name  \
0     59                          Our Baby Girl Memory Book   
1     71  Wall Decor Removable Decal Sticker - Colorful ...   
2     91  New Style Trailing Cherry Blossom Tree Decal R...   

                                              review  rating  \
0  Absolutely love it and all of the Scripture in...       5   
1  Would not purchase again or recommend. The dec...       2   
2  Was so excited to get this product for my baby...       1   

                                        review_clean  sentiment  
0  Absolutely love it and all of the Scripture in...          1  
1  Would not purchase again or recommend The deca...         -1  
2  Was so excited to get this product for my baby...         -1  


In [28]:
sample_test_data.at[0, 'review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [29]:
sample_test_data.at[1, 'review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [30]:
# Predict the sentiment for sample test data
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print scores
my_pred_senti = map(lambda score: 1 if score > 0 else -1, scores)
print my_pred_senti
print sentiment_model.predict(test_matrix[10:13,:])      # compare with the predict function

[  5.5961479   -3.16126785 -10.43314035]
[1, -1, -1]
[ 1 -1 -1]


In [31]:
# Predict probabilities
from numpy import exp
my_pred_senti_prob = map(lambda score: float(1)/(1 + exp(-score)), scores)
print my_pred_senti_prob
print sentiment_model.predict_proba(test_matrix[10:13,:]) [:,0]     # compare with the predict function

[0.9963015932441307, 0.040649582004875257, 2.9439601739096726e-05]
[ 0.00369841  0.95935042  0.99997056]


In [None]:
# Quiz question:
# Which of the following products are represented in the 20 most negative reviews?
predicted_prob = sentiment_model.predict_proba(test_matrix)
test_data['predicted_probability'] = pd.Series(predicted_prob[:,1], index=test_data.index)
test_data = test_data.sort_values(by='predicted_probability', ascending=True)
test_data[:20]

In [None]:
# Quiz question:
# Which of the following products are represented in the 20 most positive reviews?
test_data = test_data.sort_values(by='predicted_probability', ascending=False)
test_data[:20]

In [None]:
def calculate_accuracy(data_frame):
    correct_count = (data_frame['predicted_sentiment'] == data_frame['sentiment']).sum()
    accuracy = float(correct_count)/len(data_frame)
    return accuracy

In [None]:
predicted_class = sentiment_model.predict(test_matrix)
test_data['predicted_sentiment'] = pd.Series(predicted_class, index=test_data.index)

In [None]:
# Quiz question
# What is the accuracy of the sentiment_model on the test_data?
print calculate_accuracy(test_data)

In [None]:
# List of significant words to train a simplere model
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [None]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [None]:
# Train the logistic regression model based on the significant words.
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

In [None]:
# Quiz Question: 
# How many of the 20 coefficients (corresponding to the 20 significant_words) 
# are positive for the simple_model?
simple_model_coef_table = pd.DataFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})
simple_model_coef_table = simple_model_coef_table.sort_values(by='coefficient', 
                                                              ascending=False)
(simple_model_coef_table['coefficient'] > 0).sum()

In [None]:
# Quiz question:
# Which model (sentiment_model or simple_model) has higher accuracy on the TRAINING set?
train_data['predicted_sentiment'] = pd.Series(sentiment_model.predict(train_matrix), 
                                           index=train_data.index)
print calculate_accuracy(train_data)
train_data['predicted_sentiment'] = pd.Series(simple_model.predict(train_matrix_word_subset), 
                                           index=train_data.index)
print calculate_accuracy(train_data)

In [None]:
# Quiz question:
# Which model (sentiment_model or simple_model) has higher accuracy on the TEST set?
test_data['predicted_sentiment'] = pd.Series(sentiment_model.predict(test_matrix), 
                                           index=test_data.index)
print calculate_accuracy(test_data)
test_data['predicted_sentiment'] = pd.Series(simple_model.predict(test_matrix_word_subset), 
                                           index=test_data.index)
print calculate_accuracy(test_data)

In [None]:
# Quiz question:
# Enter the accuracy of the majority class classifier model on the test_data.
num_positive  = (test_data['sentiment'] == +1).sum()
num_negative = (test_data['sentiment'] == -1).sum()
print num_positive
print num_negative
print max(num_positive, num_negative)/float(len(test_data))

In [None]:
test_data.at[94560, 'name']