## Sentiment analysis using Naive Bayes

In [6]:
import re
import string
import pdb
import nltk
import string
from nltk.corpus import stopwords,twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from matplotlib.patches import Ellipse
from matplotlib import transforms
import numpy as np
import pandas as pd

In [4]:
def process_tweet(tweet):

    ''' 
    Clean, tokenize and stem a single tweet.

    Input: 
        tweet: str
    Output:
        cleaned_tweet: list of strs (stemmed tokens, with English stop words
            and punctuation tokens removed)
    '''
    stemmer = PorterStemmer()
    # The corpus reader returns a list; a set makes the per-token membership
    # test below constant-time instead of a linear scan.
    stop_words = set(stopwords.words('english'))

    # Remove '$'-prefixed words (e.g. stock tickers such as "$GE").
    tweet = re.sub(r'\$\w*', '', tweet)
    # Remove a leading retweet marker "RT".
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Remove hyperlinks.
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # Drop the '#' sign but keep the hashtag word itself.
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(tweet)

    # NOTE: `word not in string.punctuation` is a substring test against the
    # punctuation string, so single punctuation characters are dropped while
    # emoticon tokens such as ':)' (not a substring) are kept. This is left
    # unchanged on purpose so the token stream stays the same.
    cleaned_tweet = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_words and word not in string.punctuation
    ]

    return cleaned_tweet

In [5]:
def lookup(freqs,word,label):

    ''' 
    Return how often `word` was counted under `label`.

    Input:
        freqs: dict mapping (word, label) pairs to counts
        word: str
        label: int
    Output:
        the count stored for (word, label), or 0 when the pair is absent
    '''
    return freqs.get((word, label), 0)

In [7]:
# Number of tweets of each polarity used for training; the remainder of each
# corpus becomes the test (validation) set.
TRAIN_SIZE = 4000

# Load the positive and negative tweet samples.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Split each polarity into a training part and a held-out test part.
train_pos = all_positive_tweets[:TRAIN_SIZE]
test_pos = all_positive_tweets[TRAIN_SIZE:]
train_neg = all_negative_tweets[:TRAIN_SIZE]
test_neg = all_negative_tweets[TRAIN_SIZE:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Labels follow the concatenation order above: 1 for positive, 0 for negative.
# Sizes come from the splits themselves, so nothing is assumed about the
# length of all_positive_tweets / all_negative_tweets.
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [8]:
def count_tweets(result,tweets,ys):

    ''' 
    Accumulate (word, sentiment) occurrence counts over a collection of tweets.

    Input:
        result: dict that receives the counts (updated in place)
        tweets: list of tweets
        ys: list of sentiments (ints), paired with tweets position by position
    Output:
        result: the same dict, mapping each (word, sentiment) pair to its frequency
    '''

    for sentiment, text in zip(ys, tweets):
        for token in process_tweet(text):
            key = (token, sentiment)
            # Start from 0 the first time a pair is seen.
            result[key] = result.get(key, 0) + 1

    return result

In [9]:
# Build the (word, label) -> count table from the training split only.
freqs = count_tweets(result={}, tweets=train_x, ys=train_y)

In [10]:
def train_naive_bayes(freqs,train_x,train_y):

    ''' 
    Fit a two-class Naive Bayes model with add-one (Laplace) smoothing.

    Input:
        freqs: dict mapping (word, label) pairs to counts
        train_x: list of tweets (not used by the computation; kept so the
            signature stays unchanged for existing callers)
        train_y: list of sentiment labels; values > 0 count as positive,
            all others as negative
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers
            of positive / negative labels in train_y
        loglikelihood: dict mapping every vocabulary word to
            log(P(word | positive) / P(word | negative))

    Note: if one class is absent from train_y, np.log(0) makes the prior
    infinite (NaN when train_y is empty) and NumPy emits a warning.
    '''

    # Vocabulary: every distinct word in freqs, regardless of label.
    vocab = {word for word, _ in freqs}
    V = len(vocab)

    # Total number of word occurrences recorded for each class.
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # Number of training examples per class.
    D_pos = sum(1 for y in train_y if y > 0)
    D_neg = sum(1 for y in train_y if y <= 0)

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # Counts are read under labels 1 and 0; a missing pair counts as 0.
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # Add-one smoothing keeps both probabilities strictly positive.
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior,loglikelihood


In [11]:
def naive_bayes_predict(tweet,logprior,loglikelihood):

    ''' 
    Score a tweet by adding the log prior to the log likelihoods of its words.

    Input:
        tweet: str
        logprior: float
        loglikelihood: dict mapping words to numbers
    Output:
        p: logprior plus the loglikelihood of every processed token that
           appears in loglikelihood (unknown tokens contribute nothing)
    '''

    score = 0 + logprior

    known_scores = (
        loglikelihood[token]
        for token in process_tweet(tweet)
        if token in loglikelihood
    )
    # Accumulate left to right, one token at a time.
    for contribution in known_scores:
        score += contribution

    return score

In [15]:
# Fit the model on the training split, then score one sample tweet.
logprior, loglikelihood = train_naive_bayes(
    freqs=freqs, train_x=train_x, train_y=train_y)
my_tweet = 'She smiled.'
p = naive_bayes_predict(tweet=my_tweet, logprior=logprior,
                        loglikelihood=loglikelihood)
print(p)


1.5542634605271097


In [16]:
def test_naive_bayes(test_x,test_y,logprior,loglikelihood,naive_bayes_predict=naive_bayes_predict):

    ''' 
    Measure classification accuracy on a labelled test set.

    Input: 
        test_x: list of strs
        test_y: corresponding sentiment labels (list or array; flattened to
            1-D before comparison)
        logprior
        loglikelihood
        naive_bayes_predict: scoring function called as
            naive_bayes_predict(tweet, logprior, loglikelihood)
    Output:
        accuracy: 1 - mean(|prediction - label|)
    '''

    # A strictly positive score is classified as 1, anything else as 0.
    y_hats = np.array([
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ])

    # Flatten the labels so a plain Python list works (list - list would raise
    # TypeError) and an (m, 1) column vector does not broadcast against the
    # (m,) predictions into an (m, m) matrix.
    labels = np.asarray(test_y).ravel()

    error = np.mean(np.absolute(y_hats - labels))
    accuracy = 1 - error

    return accuracy

In [17]:
# Report accuracy on the held-out test split.
print("Naive Bayes accuracy = %0.4f" % test_naive_bayes(
    test_x=test_x, test_y=test_y,
    logprior=logprior, loglikelihood=loglikelihood))


Naive Bayes accuracy = 0.9955


Test with example tweets:

In [23]:
# Score a tweet with positive wording.
tweet = 'I am very happy today!'
p = naive_bayes_predict(tweet=tweet, logprior=logprior,
                        loglikelihood=loglikelihood)
print(p)

2.153758514183751


In [22]:
# Score a tweet with negative wording and a sad emoticon.
tweet = 'I am very very sad today :('
p = naive_bayes_predict(tweet=tweet, logprior=logprior,
                        loglikelihood=loglikelihood)
print(p)

-10.349699515372144
