In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
yelp = pd.read_csv('yelp_labelled.txt', delimiter= '\t', header=None)
yelp.columns = ['review', 'positive']
display(yelp.head())

Unnamed: 0,review,positive
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
yelp['positive'] = (yelp['positive'] == 1)

In [4]:
yelp['review'] = yelp.review.str.replace(r'[^a-zA-Z\d\s:]', '')
yelp['review'] = yelp['review'].str.lower()

In [5]:
# split negative messages and combine into one list
positive_words = yelp.review[yelp.positive].str.cat(sep=' ').split()

negative_words = yelp.review[yelp.positive == False].str.cat(sep=' ').split()

In [6]:
len(np.unique(negative_words))

1397

In [7]:
len(np.unique(positive_words))

1246

In [8]:
keywords = pd.Series(positive_words).value_counts().keys().ravel()

keywords

array(['the', 'and', 'was', ..., 'company', 'almost', 'speedy'],
      dtype=object)

In [9]:
diff = np.setdiff1d(
        ar1=pd.Series(positive_words).value_counts().keys().ravel(), 
                      ar2=pd.Series(negative_words).value_counts().keys().ravel()
                     )
diff

array(['2007', '23', '6', '7', '70', 'absolute', 'absolutley', 'accident',
       'accommodations', 'accomodate', 'accordingly', 'across', 'added',
       'affordable', 'afternoon', 'airport', 'almonds', 'amazingrge',
       'ambience', 'ample', 'andddd', 'app', 'appetizers', 'approval',
       'aria', 'array', 'assure', 'atmosphere1', 'auju', 'awesome',
       'ayce', 'az', 'baba', 'bacon', 'baklava', 'bank', 'bargain',
       'bartender', 'bartenders', 'baseball', 'bbq', 'bean', 'beateous',
       'beautiful', 'beautifully', 'beauty', 'beers', 'bellies', 'belly',
       'biscuit', 'bits', 'blanket', 'block', 'bloddy', 'blue', 'bone',
       'booksomethats', 'bowl', 'boxes', 'boyfriend', 'boys', 'bread',
       'breakfastlunch', 'breeze', 'brick', 'brings', 'bruschetta',
       'buffets', 'buldogis', 'bunch', 'butter', 'caballeros', 'caesar',
       'caf', 'cakeohhh', 'calligraphy', 'cannoli', 'cape', 'capers',
       'caring', 'carpaccio', 'case', 'cavier', 'chai', 'charming',
      

In [10]:
new = yelp
keywords = diff

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    new[str(key)] = new.review.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [11]:
data = new[keywords]
target = new['positive']

In [12]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}\nAccuracy: {}%".format(
    data.shape[0],
    (target != y_pred).sum(), round((1 - (target != y_pred).sum()/data.shape[0]) * 100, 2)
))

Number of mislabeled points out of a total 1000 points : 1
Accuracy: 99.9%


In [13]:
imdb = pd.read_csv('imdb_labelled.txt', delimiter= '\t', header=None)
imdb.columns = ['review', 'positive']
display(imdb.head())

Unnamed: 0,review,positive
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [14]:
imdb['positive'] = (imdb['positive'] == 1)

imdb['review'] = imdb.review.str.replace(r'[^a-zA-Z\d\s:]', '')
imdb['review'] = imdb['review'].str.lower()

# split negative messages and combine into one list
ipositive_words = imdb.review[imdb.positive].str.cat(sep=' ').split()

inegative_words = imdb.review[imdb.positive == False].str.cat(sep=' ').split()

In [15]:
ikeywords = pd.Series(inegative_words).value_counts().keys().ravel()

idiff = np.setdiff1d(
        ar1=pd.Series(ipositive_words).value_counts().keys().ravel(), 
                      ar2=pd.Series(inegative_words).value_counts().keys().ravel()
                     )
idiff

array(['1010', '12', '15', ..., 'yelps', 'younger', 'zombie'],
      dtype=object)

In [16]:
new = imdb
keywords = idiff

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    new[str(key)] = new.review.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

data = new[keywords]
target = new['positive']

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}\nAccuracy: {}%".format(
    data.shape[0],
    (target != y_pred).sum(), round((1 - (target != y_pred).sum()/data.shape[0]) * 100, 2)
))

Number of mislabeled points out of a total 748 points : 2
Accuracy: 99.73%


In [17]:
amazon = pd.read_csv('amazon_cells_labelled.txt', delimiter= '\t', header=None)
amazon.columns = ['review', 'positive']
display(amazon.head())

Unnamed: 0,review,positive
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [18]:
amazon['positive'] = (amazon['positive'] == 1)

amazon['review'] = amazon.review.str.replace(r'[^a-zA-Z\d\s:]', '')
amazon['review'] = amazon['review'].str.lower()

# split negative messages and combine into one list
apositive_words = amazon.review[amazon.positive].str.cat(sep=' ').split()

anegative_words = amazon.review[amazon.positive == False].str.cat(sep=' ').split()

In [19]:
akeywords = pd.Series(anegative_words).value_counts().keys().ravel()

adiff = np.setdiff1d(
        ar1=pd.Series(apositive_words).value_counts().keys().ravel(), 
                      ar2=pd.Series(inegative_words).value_counts().keys().ravel()
                     )
adiff

array(['100', '12', '15', '18', '2000', '2005', '24', '2mp', '325', '350',
       '42', '50', '5020', '510', '680', '7', '700w', '8530', ':', ':oh',
       'able', 'ac', 'access', 'accessable', 'accessing', 'accompanied',
       'according', 'activesync', 'adapters', 'additional', 'address',
       'adorable', 'advertised', 'against', 'ago', 'alarm', 'allot',
       'allows', 'alone', 'alot', 'aluminum', 'amazed', 'amazon', 'ample',
       'ant', 'antiglare', 'anywhere', 'apart', 'appears', 'applifies',
       'appointments', 'armband', 'arrival', 'arrived', 'attacked',
       'attractive', 'authentic', 'autoanswer', 'available', 'awsome',
       'background', 'bar', 'bargain', 'bars', 'batteries', 'battery',
       'beat', 'beats', 'behing', 'belt', 'bitpim', 'blackberry',
       'blacktop', 'blue', 'blueant', 'bluetooth', 'bluetoothmotorola',
       'bluetooths', 'boot', 'bose', 'boy', 'brand', 'brilliant',
       'browser', 'browsing', 'bt', 'bt250v', 'bubbling', 'build',
       'bu

In [20]:
new = amazon
keywords = adiff

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    new[str(key)] = new.review.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

data = new[keywords]
target = new['positive']

from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}\nAccuracy: {}%".format(
    data.shape[0],
    (target != y_pred).sum(), round((1 - (target != y_pred).sum()/data.shape[0]) * 100, 2)
))

Number of mislabeled points out of a total 1000 points : 303
Accuracy: 69.7%
