In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Amazon data

In [2]:
amazon = pd.read_table('amazon_cells_labelled.txt', header=None)
amazon.columns = ['text', 'score']

keywords = [line.rstrip() for line in open('positive-words.txt')]

for key in keywords:
    amazon[str(key)] = amazon.text.str.contains(' '+str(key)+' ', case=False)
    
#set score column into a boolean
amazon['score'] = (amazon['score'] == 1)

data = amazon[keywords]
target = amazon['score']

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

print('Number of mislabeled points out of a total {} points: {}'.format(data.shape[0],(target != y_pred).sum()))

Number of mislabeled points out of a total 1000 points: 272


In [3]:
#check for overfitting
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.71, 0.67, 0.69, 0.68, 0.72, 0.64, 0.71, 0.65, 0.7 , 0.65])

## IMDB data

In [4]:
imdb = pd.read_table('imdb_labelled.txt', header=None)
imdb.columns = ['text', 'score']

for key in keywords:
    imdb[str(key)] = imdb.text.str.contains(' '+str(key)+' ', case=False)

imdb['score'] = (imdb['score'] == 1)

data_imdb = imdb[keywords]
target_imdb = imdb['score']

bnb.fit(data_imdb, target_imdb)
y_pred_imdb = bnb.predict(data_imdb)

print('Number of mislabeled points out of a total {} points: {}'
      .format(data_imdb.shape[0],(target_imdb != y_pred_imdb).sum()))

Number of mislabeled points out of a total 748 points: 197


In [5]:
cross_val_score(bnb, data_imdb, target_imdb, cv=10)

array([0.64473684, 0.60526316, 0.68      , 0.58666667, 0.72      ,
       0.6       , 0.7027027 , 0.63513514, 0.51351351, 0.67567568])

## Yelp data

In [6]:
yelp = pd.read_table('yelp_labelled.txt', header=None)
yelp.columns = ['text', 'score']

for key in keywords:
    yelp[str(key)] = yelp.text.str.contains(' '+str(key)+' ', case=False)

yelp['score'] = (yelp['score'] == 1)

data_yelp = yelp[keywords]
target_yelp = yelp['score']

bnb.fit(data_yelp, target_yelp)
y_pred_yelp = bnb.predict(data_yelp)

print('Number of mislabeled points out of a total {} points: {}'
      .format(data_yelp.shape[0],(target_yelp != y_pred_yelp).sum()))

Number of mislabeled points out of a total 1000 points: 278


In [7]:
cross_val_score(bnb, data_yelp, target_yelp, cv=10)

array([0.67, 0.67, 0.67, 0.67, 0.69, 0.65, 0.75, 0.67, 0.63, 0.73])

## Conclusion

Models may be overfitting to a small degree