In [1]:
import numpy as np
import pandas as pd
import string
import math
import json
import pydotplus
from IPython.display import Image 

In [20]:
# Load the product-review table from the CSV file in the working directory.
REVIEWS_CSV = 'amazon_baby.csv'
products = pd.read_csv(REVIEWS_CSV)

## Perform Cleaning

In [21]:
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without ASCII punctuation; all other characters
        (including whitespace) are left as they are.
    """
    # Empty from/to mappings plus a deletion set: the table only removes characters.
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_stripper)

In [22]:
# Replace missing reviews with empty strings so the later punctuation-stripping
# step only ever sees str values.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `products.review`: that chained in-place call operates
# on an intermediate Series and is not guaranteed to update `products`
# (it leaves the frame unchanged under pandas Copy-on-Write).
products['review'] = products['review'].fillna('')

In [23]:
# Strip punctuation from each review and store the result in a new column,
# leaving the original 'review' text untouched.
cleaned_reviews = products['review'].apply(remove_punctuation)
products['review_clean'] = cleaned_reviews

## Extract Sentiments

In [24]:
# Drop reviews rated exactly 3 so only the remaining ratings are labelled.
# .copy() makes the filtered frame independent of the original, so that adding
# columns to it afterwards (e.g. 'sentiment') does not raise
# SettingWithCopyWarning.
products = products[products['rating'] != 3].copy()

In [25]:
# Label each review: ratings above 3 become +1, everything else becomes -1.
sentiment_labels = products['rating'].apply(lambda stars: +1 if stars > 3 else -1)
products['sentiment'] = sentiment_labels

## Split into training and test sets

In [26]:
# Read the training-set row indices from JSON; read_json yields a frame and the
# values are taken from its column 0.
# NOTE(review): presumably the file is a flat JSON array of integers - confirm.
train_idx_frame = pd.read_json('module-9-assignment-train-idx.json')
train_index = list(train_idx_frame[0])

In [27]:
# Read the test-set row indices from JSON; read_json yields a frame and the
# values are taken from its column 0.
# NOTE(review): presumably the file is a flat JSON array of integers - confirm.
test_idx_frame = pd.read_json('module-9-assignment-test-idx.json')
test_index = list(test_idx_frame[0])

In [28]:
# Select rows by position (iloc), not by index label: the index lists refer to
# row positions within the current `products` frame. All columns are kept.
train_data = products.iloc[train_index, :]
test_data = products.iloc[test_index, :]

In [29]:
# Report how many reviews ended up in each split.
n_train = len(train_data)
n_test = len(test_data)
print('Training set: %d data points' % n_train)
print('Test set: %d data points' % n_test)

Training set: 133416 data points
Test set: 33336 data points


## Build the word count vector for each review

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Token pattern chosen so that single-letter words are kept as tokens.
SINGLE_LETTER_TOKEN_PATTERN = r'\b\w+\b'
vectorizer = CountVectorizer(token_pattern=SINGLE_LETTER_TOKEN_PATTERN)

train_reviews = train_data['review_clean']
test_reviews = test_data['review_clean']

# Learn the vocabulary (word -> column mapping) from the training reviews and
# convert them into a sparse word-count matrix in one step.
train_matrix = vectorizer.fit_transform(train_reviews)
# Convert the test reviews with the same word -> column mapping; the vocabulary
# is not re-learned here.
test_matrix = vectorizer.transform(test_reviews)

## Train a sentiment classifier with logistic regression

In [31]:
from sklearn import linear_model

In [32]:
# Logistic-regression classifier created with the library's default settings
# (no hyperparameters are overridden here).
logistic = linear_model.LogisticRegression()

In [33]:
# Fit the classifier on the training word counts against the +1/-1 sentiment labels.
train_labels = train_data['sentiment']
model = logistic.fit(train_matrix, train_labels)

## Model Evaluation

In [37]:
from sklearn.metrics import accuracy_score
# Compare the model's predictions on the test matrix with the true test labels.
# `.values` replaces Series.as_matrix(), which was deprecated and then removed
# from pandas (calling it raises AttributeError on current versions).
test_labels = test_data['sentiment'].values
test_predictions = model.predict(test_matrix)
accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)
print("Test Accuracy: %s" % accuracy)

Test Accuracy: 0.932265418766


## Baseline: Majority class prediction

In [39]:
# Accuracy of always predicting +1: the fraction of test reviews labelled +1.
# NOTE(review): this hard-codes +1 as the majority class - confirm it holds for the data in use.
positive_test_reviews = test_data[test_data['sentiment'] == 1]
baseline = len(positive_test_reviews) / len(test_data)
print("Baseline accuracy (majority class classifier): %s" % baseline)

Baseline accuracy (majority class classifier): 0.8427825773938085


## Confusion Matrix

In [47]:
from sklearn.metrics import confusion_matrix
# Build the confusion matrix from the true test labels and the model's predictions.
# `.values` replaces Series.as_matrix(), which was deprecated and then removed
# from pandas (calling it raises AttributeError on current versions).
test_labels = test_data['sentiment'].values
test_predictions = model.predict(test_matrix)
cmat = confusion_matrix(y_true=test_labels,
                        y_pred=test_predictions,
                        labels=model.classes_)    # use the same class order as the LR model
print(' target_label | predicted_label | count ')
print('--------------+-----------------+-------')
# Print one row per (target, predicted) pair. cmat is indexed [target, predicted],
# with both axes ordered like model.classes_ because of the `labels` argument above.
for i, target_label in enumerate(model.classes_):
    for j, predicted_label in enumerate(model.classes_):
        print('{0:^13} | {1:^15} | {2:5d}'.format(target_label, predicted_label, cmat[i, j]))

 target_label | predicted_label | count 
--------------+-----------------+-------
     -1       |       -1        |  3788
     -1       |        1        |  1453
      1       |       -1        |   805
      1       |        1        | 27290
