In [3]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Amazon Reviews - Naive Bayes

Classifies unique words from reviews as positive or negative, then uses these lists to predict review scores

In [4]:
amazon = pd.read_table('amazon_cells_labelled.txt', header=None)
amazon.columns = ['text', 'negative']
amazon = pd.DataFrame(amazon)
amazon['text'] = amazon.text.str.lower() #lowercase all rows
amazon.head()

Unnamed: 0,text,negative
0,so there is no way for me to plug it in here i...,0
1,"good case, excellent value.",1
2,great for the jawbone.,1
3,tied to charger for conversations lasting more...,0
4,the mic is great.,1


In [5]:
#set score column to boolean for comparisons, negative = True
amazon['negative'] = (amazon['negative'] == 0)

In [6]:
#count unique words
amazon_words = pd.DataFrame(amazon['text'].str.split(' ', expand=True).stack().value_counts())
amazon_words.columns = ['total_count']
amazon_words.head()

Unnamed: 0,total_count
the,513
i,313
and,310
is,238
it,237


In [7]:
#get counts for negative and positive reviews
amazon_negative = amazon[amazon['negative']==True]
amazon_positive = amazon[amazon['negative']==False]

amazon_negative_count = pd.DataFrame(amazon_negative['text'].str.split(' ', expand=True).stack().value_counts())
amazon_negative_count.columns = ['negative_count']
amazon_positive_count = pd.DataFrame(amazon_positive['text'].str.split(' ', expand=True).stack().value_counts())
amazon_positive_count.columns = ['positive_count']

#join together
count_joined = amazon_negative_count.join(amazon_positive_count)
count_joined = count_joined.join(amazon_words['total_count'])

#calculate probabilites and join
count_joined['negative_prob'] = count_joined['negative_count']/count_joined['total_count']
count_joined['positive_prob'] = count_joined['positive_count']/count_joined['total_count']
count_joined.head()

Unnamed: 0,negative_count,positive_count,total_count,negative_prob,positive_prob
the,276,237.0,513,0.538012,0.461988
i,162,151.0,313,0.517572,0.482428
it,129,108.0,237,0.544304,0.455696
and,122,188.0,310,0.393548,0.606452
a,113,104.0,217,0.520737,0.479263


In [8]:
#put higher negative probabilty words into list
amzn_words_negative = pd.DataFrame(count_joined['negative_prob'] > count_joined['positive_prob'])
amzn_words_negative.columns = ['greater']
amzn_words_negative = amzn_words_negative[amzn_words_negative['greater'] == True]
amzn_words_negative = amzn_words_negative.index.values

In [9]:
#run naive bayes classifier
for word in amzn_words_negative:
    amazon[str(word)] = amazon.text.str.contains(' ' + str(word) + ' ')
    
amzn_data_neg = amazon[amzn_words_negative]
amzn_target_neg = amazon['negative']

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(amzn_data_neg, amzn_target_neg)
amzn_pred_neg = bnb.predict(amzn_data_neg)

print('Number of mislabeled points out of a total {} points: {}'
      .format(amzn_data_neg.shape[0],(amzn_target_neg != amzn_pred_neg).sum()))

Number of mislabeled points out of a total 1000 points: 332


In [10]:
#do same but from positive angle
amzn_words_positive = pd.DataFrame(count_joined['positive_prob'] > count_joined['negative_prob'])
amzn_words_positive.columns = ['greater']
amzn_words_positive = amzn_words_positive[amzn_words_positive['greater'] == True]
amzn_words_positive = amzn_words_positive.index.values

In [11]:
#run naive bayes classifier
for word in amzn_words_positive:
    amazon[str(word)] = amazon.text.str.contains(' ' + str(word) + ' ')
    
amzn_data_pos = amazon[amzn_words_positive]
amzn_target_pos = amazon['negative']

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(amzn_data_pos, amzn_target_pos)
amzn_pred_pos = bnb.predict(amzn_data_pos)

print('Number of mislabeled points out of a total {} points: {}'
      .format(amzn_data_pos.shape[0],(amzn_target_pos != amzn_pred_pos).sum()))

Number of mislabeled points out of a total 1000 points: 343
