In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy
import sklearn
%matplotlib inline

In [2]:
# Grab and process the raw data.
raw = pd.read_csv('imdb_labelled.txt', delimiter= '\t', header=None)
raw.columns = ['message','favor']

In [3]:
raw.describe()

Unnamed: 0,favor
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [4]:
# add some filters
keywords = ['best', 'wonderful', 'incredible', 'good', 'fantastic', 'fast', 'excellent', 'heaven', 'fresh', 'gem', 'favorite',
           'funny', 'hilarious', 'masterpiece','brilliance']

for key in keywords: 
    raw[str(key)] = raw.message.str.contains(
        str(key),
        case=False
    )


In [5]:
# sklearn requires some inputs
data = raw[keywords]
target = raw['favor']

In [6]:
data

Unnamed: 0,best,wonderful,incredible,good,fantastic,fast,excellent,heaven,fresh,gem,favorite,funny,hilarious,masterpiece,brilliance
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
# apply Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 748 points : 305


In [8]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[338,  24],
       [281, 105]], dtype=int64)

In [12]:
# try using negative keywords to identify negative reviews (instead of positive)

neg_keywords = ['poor', 'bad', 'disappoint', 'boring', 'lame', 'waste', 'predictable', 'worst', 'pointless', 'shallow',
           'annoy', 'regret', 'unfunny','insincere', 'doom','lack']

for key in neg_keywords:
    raw[str(key)] = raw.message.str.contains(
        str(key),
        case=False
    )
# add an unfavor column for negative operations
raw['unfavor'] = raw.apply(lambda row: not row.favor, axis=1)

In [14]:
ntarget = raw['unfavor']
n_data = raw[neg_keywords]
bnb.fit(n_data, ntarget)

y_pred = bnb.predict(n_data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    n_data.shape[0],
    (ntarget != y_pred).sum()
))

Number of mislabeled points out of a total 748 points : 268


In [15]:
# that is an improvement
# try adding not to the mix
neg_keywords = ['poor', 'bad', 'disappoint', 'boring', 'lame', 'waste', 'predictable', 'worst', 'pointless', 'shallow',
           'annoy', 'regret', 'unfunny','insincere','lack','not']

for key in neg_keywords:
    raw[str(key)] = raw.message.str.contains(
        str(key),
        case=False,
    )

In [16]:
n_data = raw[neg_keywords]
bnb.fit(n_data, ntarget)

y_pred = bnb.predict(n_data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    n_data.shape[0],
    (ntarget != y_pred).sum()
))

Number of mislabeled points out of a total 748 points : 249


In [17]:
# additional improvement. How does the confusion matrix look?
confusion_matrix(ntarget, y_pred)

array([[352,  34],
       [215, 147]], dtype=int64)

In [20]:
# let's add some holdouts and see how it holds up.
y_pred = bnb.fit(n_data, ntarget).predict(n_data)
from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(n_data, ntarget, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(n_data, ntarget).score(n_data, ntarget)))

With 20% Holdout: 0.6733333333333333
Testing on Sample: 0.6671122994652406


In [21]:
#  That's very close.
# let's add some folds and see how it holds up.
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, n_data, ntarget, cv=10)

array([0.69736842, 0.61842105, 0.65333333, 0.69333333, 0.68      ,
       0.62666667, 0.62162162, 0.64864865, 0.62162162, 0.74324324])

In [24]:
# This indicates a bit more variance - ranging from .62 to .74 where as the holdout test result is
# extremely close.
# Let's see if we can add a handful of features

for result in raw['favor']:
    raw['like'] = ( (raw['best'] == True) | (raw['wonderful'] == True) | (raw['incredible'] == True) | 
            (raw['good'] == True) | (raw['fantastic'] == True) | (raw['fast'] == True) |
            (raw['excellent'] == True) | (raw['heaven'] == True) | (raw['fresh'] == True) |
            (raw['gem'] == True) | (raw['favorite'] == True) | (raw['funny'] == True) |
            (raw['hilarious'] == True) | (raw['masterpiece'] == True) | (raw['brilliance'] == True)  ) 
    
    raw['dislike'] = ( (raw['poor'] == True) | (raw['bad'] == True) | (raw['disappoint'] == True) | 
            (raw['boring'] == True) | (raw['lame'] == True) | (raw['waste'] == True) |
            (raw['predictable'] == True) | (raw['worst'] == True) | (raw['pointless'] == True) |
            (raw['shallow'] == True) | (raw['annoy'] == True) | (raw['regret'] == True) | (raw['not'] == True) | 
            (raw['unfunny'] == True) | (raw['insincere'] == True) | (raw['lack'] == True)  )

In [25]:
raw['both'] = raw.apply(lambda row: row.like and row.dislike, axis=1)

In [26]:
# Let's run the eval adding in a filter for handling both (stratify the both flags out of data)
ntarget = raw['unfavor'][raw['both'] != True]
n_data = raw[neg_keywords][raw['both'] != True]
bnb.fit(n_data, ntarget)

y_pred = bnb.predict(n_data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    n_data.shape[0],
    (ntarget != y_pred).sum()
))

Number of mislabeled points out of a total 727 points : 241


In [27]:
# Let's run the eval adding in a filter for handling both (stratify the both flags out of data) - this time on favorables
target = raw['favor'][raw['both'] != True]
data = raw[keywords][raw['both'] != True]
bnb.fit(data, target)

y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
# negative keywords still providing better results

Number of mislabeled points out of a total 727 points : 292


In [29]:
# How does the confusion matrix look?
confusion_matrix(ntarget, y_pred)

array([[283,  95],
       [340,   9]], dtype=int64)

In [34]:
# Let's start with a clean data file again and then work the negative route with "not" separated
ndf = pd.read_csv('imdb_labelled.txt', delimiter= '\t', header=None)
ndf.columns = ['message','favor']
ndf_keywords = ['poor', 'bad', 'disappoint', 'boring', 'lame', 'waste', 'predictable', 'worst', 'pointless', 'shallow',
           'annoy', 'regret', 'unfunny','insincere', 'doom','lack']

for key in ndf_keywords:
    ndf[str(key)] = ndf.message.str.contains(
        str(key),
        case=False
    )
# add an unfavor column for negative operations
ndf['unfavor'] = ndf.apply(lambda row: not row.favor, axis=1)
# add column to indicate any of the negative words
ndf['dislike'] = ndf.any(axis=1, bool_only=bool)
# add column to identify "not"
ndf['contra'] = ndf.message.str.contains('not', case=False)
# add column to flag items with both a dislike keyword and a not
ndf['contra_neg'] = ndf.apply(lambda row: row.contra and row.dislike, axis=1)

In [45]:
# evaluate these classifiers

sub_df = ndf[['dislike','contra']]
ndata = sub_df[(sub_df['contra'] == False)]
ntarget = ndf['unfavor'][(ndf['contra'] == False)]

# Fit our model to the data.
bnb.fit(ndata, ntarget)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(ndata)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    ndata.shape[0],
    (ntarget != y_pred).sum()
))

Number of mislabeled points out of a total 675 points : 11


In [46]:
# How does the confusion matrix look?
confusion_matrix(ntarget, y_pred)

array([[352,  11],
       [  0, 312]], dtype=int64)

In [47]:
# let's try a holdout test and see how it holds up.
y_pred = bnb.fit(ndata, ntarget).predict(ndata)
from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(ndata, ntarget, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(ndata, ntarget).score(ndata, ntarget)))

With 20% Holdout: 0.9703703703703703
Testing on Sample: 0.9837037037037037


In [48]:
#  That's very close.
# let's add some folds and see how it holds up.
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, ndata, ntarget, cv=10)

array([0.97101449, 1.        , 1.        , 0.97014925, 0.95522388,
       0.98507463, 1.        , 0.98507463, 0.97014925, 1.        ])

In [49]:
# Now let's do it again work the positive route with "not" separated
pdf = pd.read_csv('imdb_labelled.txt', delimiter= '\t', header=None)
pdf.columns = ['message','favor']
pdf_keywords = ['best', 'wonderful', 'incredible', 'good', 'fantastic', 'fast', 'excellent', 'heaven', 'fresh', 'gem', 
           'favorite', 'funny', 'hilarious', 'masterpiece','brilliance', 'beautiful']

for key in pdf_keywords:
    pdf[str(key)] = pdf.message.str.contains(
        str(key),
        case=False
    )
# add column to indicate any of the negative words
pdf['like'] = pdf.any(axis=1, bool_only=bool)
# add column to identify "not"
pdf['contra'] = pdf.message.str.contains('not', case=False)
# add column to flag items with both a dislike keyword and a not
pdf['contra_pos'] = pdf.apply(lambda row: row.contra and row.like, axis=1)

In [52]:
# evaluate these classifiers
ps_data = pdf[['like','contra']]
pdata = ps_data[(ps_data['contra'] == False)]
target = pdf['favor'][(pdf['contra'] == False)]

# Fit our model to the data.
bnb.fit(pdata, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(pdata)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    pdata.shape[0],
    (target != y_pred).sum()
))
#  negative keywords producing better results for classifying

Number of mislabeled points out of a total 675 points : 275
