In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import string
from IPython.display import clear_output
import pickle
import time

In [2]:
#load precomputed word probabilities and preprocessed review dataset
with open('probs.pickle', 'rb') as f:
    probs_dict = pickle.load(f)
with open('ppReviewsMIN.pickle', 'rb') as f:
    data = pickle.load(f)

In [3]:
#new naive bayes function that uses precomputed word probabilities
def dp_get_total_probs(wordset, probs):
    posprob = 1
    negprob = 1
    for word in wordset:
        prob = probs[word]
        posprob *= prob['pos']
        negprob *= prob['neg']
    return {
        'pos': posprob,
        'neg': negprob,
        'pred': 1 if posprob > negprob else -1
    }

In [4]:
#grab reviews with non-neutral sentiment as test data
#note: because we don't have any unseen data, this test is technically invalid due to train-validation contamination
non_neutral = data.loc[abs(data['Sentiment']) == 1]

In [6]:
start = time.time()

#sample test data and perform naive bayes prediction
test = non_neutral.sample(100)
results = []
count = 0
for index, row in test.iterrows():
    review = row['CleanSet']
    probs = dp_get_total_probs(review, probs_dict)
    probs['tru'] = row['Sentiment']
    results.append(probs)
    count += 1
    clear_output(wait=True)
    print(f"Prediction {count} done.")
correct = 0
for attempt in results:
    if attempt['tru'] == attempt['pred']:
        correct += 1
correct/len(results)

#print elapsed time
elapsed = time.time() - start
print(f"NB on {len(results)} predictions using dynamic programming took {elapsed:.4f} seconds - {elapsed/len(results):.4f} seconds per prediction.")

Prediction 100 done.


0.91

NB on 100 predictions using dynamic programming took 0.2257 seconds - 0.0023 seconds per prediction.


# Comparison with old NB

In [14]:
def get_probs(word, pos, neg):
    pres = sum(pos['CleanSet'].apply(lambda x: word in x))/len(pos)
    nres = sum(neg['CleanSet'].apply(lambda x: word in x))/len(neg)
    return {
        'pos': pres,
        'neg': nres
    }

def get_total_probs(wordset, pos, neg):
    posprob = 1
    negprob = 1
    for word in wordset:
        prob = get_probs(word, pos, neg)
        posprob *= prob['pos']
        negprob *= prob['neg']
    return {
        'pos': posprob,
        'neg': negprob,
        'pred': 1 if posprob > negprob else -1
    }

In [17]:
pos = data.loc[data['Sentiment'] == 1]
neg = data.loc[data['Sentiment'] == -1]

In [22]:
start = time.time()

test = data.sample(100)
results = []
count = 0
for index, row in test.iterrows():
    review = row['CleanSet']
    probs = get_total_probs(review, pos, neg)
    probs['tru'] = row['Sentiment']
    results.append(probs)
    count += 1
    clear_output(wait=True)
    print(f"Prediction {count} done.")
correct = 0
for attempt in results:
    if attempt['tru'] == attempt['pred']:
        correct += 1
correct/len(results)

#print elapsed time
elapsed = time.time() - start
print(f"NB on {len(results)} predictions WITHOUT using dynamic programming took {elapsed:.4f} seconds - {elapsed/len(results):.4f} seconds per prediction.")

Prediction 100 done.


0.76

NB on 100 predictions WITHOUT using dynamic programming took 754.9853 seconds - 7.5499 seconds per prediction.
