In [1]:
import numpy as np
import nltk
import re

from utils import preprocess_tweet, build_freqs, lookup
from nltk.corpus import stopwords, twitter_samples

In [2]:
# Raw tweet strings for each sentiment class, straight from the NLTK corpus.
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

# Hold-out split: the first 4000 tweets of each class are used for training,
# everything after that is kept for evaluation.
train_pos, test_pos = pos_tweets[:4000], pos_tweets[4000:]
train_neg, test_neg = neg_tweets[:4000], neg_tweets[4000:]

# Positive tweets always come first, so the label vectors below line up by position.
x_train = train_pos + train_neg
x_test = test_pos + test_neg

# Flat float label vectors: 1.0 for each positive tweet followed by 0.0 for each negative one.
y_train = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
y_test = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [3]:
# Frequency table built from the training split only. train_naive_bayes reads its
# keys as (word, label) pairs and its values as occurrence counts.
freqs = build_freqs(x_train, y_train)

In [6]:
def train_naive_bayes(freqs, x_train, y_train):
    """Fit a binary Naive Bayes model with add-one (Laplace) smoothing.

    Parameters
    ----------
    freqs : dict
        Maps ``(word, label)`` pairs to occurrence counts. A pair whose label
        is greater than 0 contributes to the positive-class word total; any
        other pair contributes to the negative-class total.
    x_train : sequence
        Training documents. Only ``len(x_train)`` is used (the total number
        of documents).
    y_train : array-like
        Training labels. Entries equal to 1 are counted as positive
        documents; all remaining documents are treated as negative. Lists,
        flat arrays and column vectors are all accepted.

    Returns
    -------
    logprior : numpy.float64
        ``log(D_pos / D_neg)``, the log ratio of positive to negative
        document counts.
    log_likelihood : dict
        Maps every vocabulary word to
        ``log(P(word | positive) / P(word | negative))``.
    """
    # The vocabulary is every distinct word appearing in any (word, label) key.
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # Total number of word occurrences observed in each class.
    n_pos = 0
    n_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            n_pos += count
        else:
            n_neg += count

    D = len(x_train)
    # np.sum over an ndarray view counts matches regardless of the container
    # type or shape of y_train and always yields a scalar. The builtin sum()
    # fails on plain lists and returns a 1-element array for column vectors.
    D_pos = np.sum(np.asarray(y_train) == 1)
    D_neg = D - D_pos

    logprior = np.log(D_pos / D_neg)

    log_likelihood = {}
    for word in vocab:
        # lookup comes from utils (not shown here); it is expected to return
        # the stored count for (word, label), or 0 when the pair is absent.
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # Add-one smoothing keeps both probabilities strictly positive, so
        # the log ratio stays finite for words seen in only one class.
        p_w_pos = (freq_pos + 1) / (n_pos + V)
        p_w_neg = (freq_neg + 1) / (n_neg + V)

        log_likelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, log_likelihood

In [7]:
# Fit the model once: lp is the log prior, ll the per-word log-likelihood table.
lp, ll = train_naive_bayes(freqs=freqs, x_train=x_train, y_train=y_train)

In [8]:
# Sanity check: the log prior, then the number of words that received a log-likelihood.
print(lp, len(ll), sep='\n')

0.0
9159


In [9]:
def predict_naive_bayes(tweet, logprior, loglikelihood):
    """Score a tweet with a trained Naive Bayes model.

    The tweet is tokenised with ``preprocess_tweet`` (imported from utils).
    The score is ``logprior`` plus the log-likelihood of every token that has
    an entry in ``loglikelihood``; tokens without an entry are skipped.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    logprior : float
        Log prior term added once to the score.
    loglikelihood : dict
        Maps a word to its log-likelihood contribution.

    Returns
    -------
    The accumulated score. ``test_naive_bayes`` treats a value greater than 0
    as a positive prediction.
    """
    score = 0 + logprior

    known_terms = (
        loglikelihood[token]
        for token in preprocess_tweet(tweet)
        if token in loglikelihood
    )
    for term in known_terms:
        score += term

    return score

In [10]:
# Spot check on a single phrase; the cell displays the raw score it receives.
predict_naive_bayes(tweet='i smiled', logprior=lp, loglikelihood=ll)

1.5381262672501004

In [11]:
def test_naive_bayes(x_test, y_test, logprior, loglikelihood):
    
    num_correct = 0
    
    preds = []
    for tw in x_test:
        if predict_naive_bayes(tw, logprior, loglikelihood) > 0:
            preds.append(1)
        else:
            preds.append(0)
            
    error = np.mean(np.absolute(preds - y_test))
    acc = 1 - error
    
    return acc

In [13]:
# Accuracy on the held-out tweets that were excluded from training.
test_naive_bayes(x_test=x_test, y_test=y_test, logprior=lp, loglikelihood=ll)

0.9975