In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Amazon Dataset

Use naive bayes classifier

In [2]:
amazon = pd.read_table('amazon_cells_labelled.txt', header=None)
amazon.columns = ['text', 'score']
amazon.head()

Unnamed: 0,text,score
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [3]:
keywords = [line.rstrip() for line in open('positive-words.txt')]

for key in keywords:
    amazon[str(key)] = amazon.text.str.contains(' '+str(key)+' ', case=False)

__Positive keywords list sourced from:__

This file and the papers can all be downloaded from 
    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

If you use this list, please cite one of the following two papers:

   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
       Proceedings of the ACM SIGKDD International Conference on Knowledge 
       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
       Washington, USA,
       
   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing 
       and Comparing Opinions on the Web." Proceedings of the 14th 
       International World Wide Web conference (WWW-2005), May 10-14, 
       2005, Chiba, Japan.

In [4]:
#set score column into a boolean
amazon['score'] = (amazon['score'] == 1)

In [5]:
#sns.heatmap(amazon.corr())

In [6]:
data = amazon[keywords]
target = amazon['score']

In [7]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

print('Number of mislabeled points out of a total {} points: {}'.format(data.shape[0],(target != y_pred).sum()))

Number of mislabeled points out of a total 1000 points: 272


## Test on other datasets

### IMDB

In [8]:
imdb = pd.read_table('imdb_labelled.txt', header=None)
imdb.columns = ['text', 'score']
imdb.head()

Unnamed: 0,text,score
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [9]:
for key in keywords:
    imdb[str(key)] = imdb.text.str.contains(' '+str(key)+' ', case=False)

imdb['score'] = (imdb['score'] == 1)

data_imdb = imdb[keywords]
target_imdb = imdb['score']

bnb.fit(data_imdb, target_imdb)
y_pred_imdb = bnb.predict(data_imdb)

print('Number of mislabeled points out of a total {} points: {}'
      .format(data_imdb.shape[0],(target_imdb != y_pred_imdb).sum()))

Number of mislabeled points out of a total 748 points: 197


### Yelp

In [10]:
yelp = pd.read_table('yelp_labelled.txt', header=None)
yelp.columns = ['text', 'score']
yelp.head()

Unnamed: 0,text,score
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [11]:
for key in keywords:
    yelp[str(key)] = yelp.text.str.contains(' '+str(key)+' ', case=False)

yelp['score'] = (yelp['score'] == 1)

data_yelp = yelp[keywords]
target_yelp = yelp['score']

bnb.fit(data_yelp, target_yelp)
y_pred_yelp = bnb.predict(data_yelp)

print('Number of mislabeled points out of a total {} points: {}'
      .format(data_yelp.shape[0],(target_yelp != y_pred_yelp).sum()))

Number of mislabeled points out of a total 1000 points: 278
