In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn

## Amazon Reviews - Naive Bayes

Classifies unique words from reviews as positive or negative, then uses these lists to predict review scores

In [2]:
amazon = pd.read_table('amazon_cells_labelled.txt', header=None)
amazon.columns = ['text', 'negative']
amazon = pd.DataFrame(amazon)
amazon['text'] = amazon.text.str.lower() #lowercase all rows

In [3]:
#set score column to boolean for comparisons, negative = True
amazon['negative'] = (amazon['negative'] == 0)

In [12]:
#count unique words
amazon_words = pd.DataFrame(amazon['text'].str.split(' ', expand=True).stack().value_counts())
amazon_words.columns = ['total_count']

#get counts for negative and positive reviews
amazon_negative = amazon[amazon['negative']==True]
amazon_positive = amazon[amazon['negative']==False]

amazon_negative_count = pd.DataFrame(amazon_negative['text'].str.split(' ', expand=True).stack().value_counts())
amazon_negative_count.columns = ['negative_count']
amazon_positive_count = pd.DataFrame(amazon_positive['text'].str.split(' ', expand=True).stack().value_counts())
amazon_positive_count.columns = ['positive_count']

#join together
count_joined = amazon_negative_count.join(amazon_positive_count)
count_joined = count_joined.join(amazon_words['total_count'])

#calculate probabilites and join
count_joined['negative_prob'] = count_joined['negative_count']/count_joined['total_count']
count_joined['positive_prob'] = count_joined['positive_count']/count_joined['total_count']

#put higher negative probabilty words into list
amzn_words_negative = pd.DataFrame(count_joined['negative_prob'] > count_joined['positive_prob'])
amzn_words_negative.columns = ['greater']
amzn_words_negative = amzn_words_negative[amzn_words_negative['greater'] == True]
amzn_words_negative = amzn_words_negative.index.values

#do same but from positive angle
amzn_words_positive = pd.DataFrame(count_joined['positive_prob'] > count_joined['negative_prob'])
amzn_words_positive.columns = ['greater']
amzn_words_positive = amzn_words_positive[amzn_words_positive['greater'] == True]
amzn_words_positive = amzn_words_positive.index.values

In [6]:
#run naive bayes classifier
for word in amzn_words_negative:
    amazon[str(word)] = amazon.text.str.contains(' ' + str(word) + ' ')
    
amzn_data_neg = amazon[amzn_words_negative]
amzn_target = amazon['negative']

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(amzn_data_neg, amzn_target)
amzn_pred_neg = bnb.predict(amzn_data_neg)

print('Number of mislabeled points out of a total {} points: {}'
      .format(amzn_data_neg.shape[0],(amzn_target != amzn_pred_neg).sum()))

Number of mislabeled points out of a total 1000 points: 332


In [8]:
#test with different holdout groups
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(amzn_data_neg, amzn_target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(amzn_data_neg, amzn_target).score(amzn_data_neg, amzn_target)))

With 20% Holdout: 0.63
Testing on Sample: 0.668


In [9]:
#perform cross validation
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, amzn_data_neg, amzn_target, cv=10)

array([0.7 , 0.58, 0.59, 0.61, 0.62, 0.49, 0.62, 0.6 , 0.56, 0.57])

In [14]:
#run it again with positive words
for word in amzn_words_positive:
    amazon[str(word)] = amazon.text.str.contains(' ' + str(word) + ' ')

amzn_data_pos = amazon[amzn_words_positive]
bnb.fit(amzn_data_pos, amzn_target)
amzn_pred_pos = bnb.predict(amzn_data_pos)

print('Number of mislabeled points out of a total {} points: {}'
      .format(amzn_data_pos.shape[0],(amzn_target != amzn_pred_pos).sum()))

Number of mislabeled points out of a total 1000 points: 343


In [15]:
#test with holdout group
X_train, X_test, y_train, y_test = train_test_split(amzn_data_pos, amzn_target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(amzn_data_pos, amzn_target).score(amzn_data_pos, amzn_target)))

With 20% Holdout: 0.605
Testing on Sample: 0.657


In [16]:
#cross validate
cross_val_score(bnb, amzn_data_pos, amzn_target, cv=10)

array([0.65, 0.55, 0.61, 0.53, 0.57, 0.54, 0.63, 0.62, 0.61, 0.63])