In [9]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import BernoulliNB

In [2]:
#Extract the TXT from the ZIP file
import zipfile
filename = "Documents/Thinkful/Module 18/sentiment_labelled_sentences.zip"

zf = zipfile.ZipFile(filename)
imdb_raw = pd.read_csv(zf.open('sentiment labelled sentences/imdb_labelled.txt'), delimiter = '\t', header=None)
imdb_raw.columns = ['message', 'sentiment']

In [3]:
imdb_raw.head()

Unnamed: 0,message,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
#Clean the data so we can extract similar words, i.e. turn everything lowercase and remove punctuation
clean_msg = [row.lower()
           .replace(",", "").replace(".", "").replace("!", "").replace("?", "")
           .replace(";", "").replace(":", "").replace("*", "")
           .replace(" - ", " ").replace("(", "")
           .replace(")", "").replace("/", "")
           for row in imdb_raw['message']]

imdb_raw['message'] = clean_msg
imdb_raw.head()

Unnamed: 0,message,sentiment
0,a very very very slow-moving aimless movie abo...,0
1,not sure who was more lost the flat characters...,0
2,attempting artiness with black & white and cle...,0
3,very little music or anything to speak of,0
4,the best scene in the movie was when gerardo i...,1


In [5]:
#Split into two datasets: positive and negative
pos_msg = imdb_raw['message'][imdb_raw['sentiment']==1]
pos_msg = list(pos_msg.reset_index(drop=True))

neg_msg = imdb_raw['message'][imdb_raw['sentiment']==0]
neg_msg = list(neg_msg.reset_index(drop=True))

#Combine all strings into one long string for each dataset
seperator = ' '
pos_long = seperator.join(pos_msg)
neg_long = seperator.join(neg_msg)

#Extract words and number of repeats in each dataset
from collections import Counter
pos_counts = Counter(pos_long.split())
neg_counts = Counter(neg_long.split())

In [6]:
#Get list of positive and negative words that appear at least twice
pos_words = [x[0] for x in pos_counts.most_common() if x[1] >= 2]
neg_words = [x[0] for x in neg_counts.most_common() if x[1] >= 2]

#Remove any words that appear in both lists
for word in pos_words[:]:
    if word in neg_words:
        pos_words.remove(word)
        neg_words.remove(word)

In [7]:
#Now that we have a list of positive and negative keywords, compare them to all messages
for word in pos_words:
    imdb_raw['Positive: ' + str(word)] = imdb_raw.message.str.contains(' ' + str(word) + ' ', case=False)
    
for word in neg_words:
    imdb_raw['Negative: ' + str(word)] = imdb_raw.message.str.contains(' ' + str(word) + ' ', case=False)
       
#Convert sentiment column into boolean for positive sentiment
imdb_raw['sentiment'] = (imdb_raw['sentiment'] == 1)

### Test the model

In [8]:
#Use Bernoulli to test for accuracy of the model based on current keywords
data = imdb_raw[imdb_raw.columns.drop(['message','sentiment'])]
target = imdb_raw['sentiment']

In [11]:
# Train model and test, using entire data set, as in Module 18.6
bnb1 = BernoulliNB()
bnb1.fit(data, target)
score1 = bnb1.score(data, target)

print("Score when using the entire sample: " + str(round(score1,2)))

Score when using the entire sample: 0.86


In [15]:
#Test the model with holdout groups
from sklearn.model_selection import train_test_split
bnb2 = BernoulliNB()

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('Score with 20% Holdout: ' + str(bnb2.fit(X_train, y_train).score(X_test, y_test)))

Score with 20% Holdout: 0.8266666666666667


In [12]:
#Test the model via cross-validation (10 folds)
from sklearn.model_selection import cross_val_score
bnb3 = BernoulliNB()
folds = 10
scores3 = cross_val_score(bnb3, data, target, cv=folds)

print("Scores when cross-validation is used with 10 folds \n", scores3)

Scores when cross-validation is used with 10 folds 
 [0.80263158 0.77631579 0.82666667 0.74666667 0.74666667 0.76
 0.77027027 0.7972973  0.67567568 0.75675676]


### Tune parameters to improve accuracy

In [17]:
#Tune size of holdout group and parameter 'random_state' used for initializing the random number generator
bnb4 = BernoulliNB()
group_size = []
random_state = []
values = []

for i in range(100):    #range of possible random_state values (100 is an aribitrary range. It could be larger or smaller, as needed)
    for size in np.arange(0.10,0.55,0.05):      #range of possible holdout group sizes
        size = round(size,2)     #some values in the array may have weird extra decimals
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=size, random_state=i)
        values.append(round(bnb4.fit(X_train, y_train).score(X_test, y_test),2))
        random_state.append(i)
        group_size.append(size)
        
ind = values.index(max(values))    #index where the optimized value is located
print("Maximum accuracy is obtained with a random_state value of: ", random_state[ind])
print("Maximum accuracy is obtained with a holdout group_size of: ", group_size[ind])
print("When both optimized values are used, the calculated accuracy is: ", max(values))


Maximum accuracy is obtained with a random_state value of:  37
Maximum accuracy is obtained with a holdout group_size of:  0.1
When both optimized values are used, the calculated accuracy is:  0.88


In [54]:
#Tune number of folds to optimize cross-validation accuracy
bnb5 = BernoulliNB()
scores5 = []
cross_val = []
avg_acc = []
folds = list(range(2,20))     #Arbitrary range of folds. Could have done more or fewer.

for fold in folds:
    scores5 = cross_val_score(bnb5, data, target, cv=fold)
    avg_acc.append(round(scores5.mean(),2))    #greatest average accuracy
    cross_val.append([round(x,2) for x in scores5])
    
ind = avg_acc.index(max(avg_acc))    #index where the optimized value is located
print("Maximum accuracy is obtained with {} folds.".format(folds[ind]))
print("When {} folds are used, the cross-validation values are: \n{}".format(folds[ind], cross_val[ind]))

Maximum accuracy is obtained with 15 folds.
When 15 folds are used, the cross-validation values are: 
[0.76, 0.82, 0.82, 0.84, 0.74, 0.78, 0.74, 0.84, 0.78, 0.78, 0.86, 0.76, 0.57, 0.82, 0.76]


Based on the results of the test using holdout groups and cross-validation, I don't believe my classifier has overfit, since all of the values come out to approximately 80-86%, which shows that there isn't a significant difference between separated groups. The test in which I used an optimized random_state value and holdout group size provided the greatest overall accuracy (88%), which makes sense, as it was by that method that I iterated through hundreds of possible combinations. Performance of the model seems to be most impacted by the number of folds in cross-validation and the size of the holdout group.