In [266]:
# DATA : https://www.kaggle.com/paoloripamonti/twitter-sentiment-analysis/data?select=training.1600000.processed.noemoticon.csv
# after downloading, change .csv file name to tweets.csv

401 - Unauthorized


## Libraries

In [235]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
stop_words = stopwords.words("english")

## Load Data

Data contains tweets with positive and negative sentiment.

In [244]:
data = pd.read_csv("tweets.csv", encoding="ISO-8859-1", names = ["target", "ids", "date", "flag", "user", "text"])

In [245]:
data = data.sample(frac=1) # randomize the data

In [246]:
data = data[["text", "target"]] 

In [247]:
decode_map = {0: 0, 4: 1}
def decode_sentiment(label):
    return decode_map[label]

data["target"] = pd.DataFrame(data["target"].apply(lambda x: decode_sentiment(x)))

In [248]:
data = data[0:1000000]
data

Unnamed: 0,text,target
836318,hi @ana_alemana ! I would suggest music of my ...,1
239034,Haave To gO nOW haha byee dont miss me to mu...,0
955498,Today is sightseeing day! Get out of the house...,1
340990,I dont wanna get old.ever. My back will be my ...,0
212840,Leaving my house and wont be back till Monday....,0
...,...,...
673919,"uh, oh. i may have deleted all my e-mails by a...",0
836004,@wagnerofficial You're a trip Jill,1
800435,will invite people to twitter.,1
1414817,@tommcfly yerrrp u should,1


In [249]:
len(data[data['target'] == 0]), len(data[data['target'] == 1])

(499971, 500029)

## Preprocessing

In [250]:
def preprocess(text):
    # Remove links,user ids and special characters
    text = re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            tokens.append(token)
    return " ".join(tokens)

In [251]:
data["text"] = data["text"].apply(lambda x: preprocess(x))

In [252]:
data

Unnamed: 0,text,target
836318,hi ana alemana would suggest music electro pro...,1
239034,haave go haha byee dont miss much www youtube ...,0
955498,today sightseeing day get house shtuff,1
340990,dont wanna get old ever back death ugh,0
212840,leaving house wont back till monday scared eve...,0
...,...,...
673919,uh oh may deleted e mails accident,0
836004,trip jill,1
800435,invite people twitter,1
1414817,yerrrp u,1


## Train-Test Split

In [253]:
train, test = train_test_split(data, test_size=0.2, random_state=42)
print("train:", len(train))
print("test:", len(test))

train: 800000
test: 200000


In [254]:
X_train = train["text"]
y_train = train["target"]
X_test = test["text"]
y_test = test["target"]

## Naive Bayes

In [255]:
## count words in tweets according to their labels
## {(word, label): count}
def count_tweets(result, tweets, ys):
    for y, tweet in zip(ys, tweets):
        for word in tweet.split():
            # define the word and label tuple
            pair = (word,y)

            # if the (word, label) tuple exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the (word, label) tuple is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1

    return result

In [256]:
frequencies = count_tweets({}, X_train, y_train)

In [257]:
frequencies

{('woke', 0): 2622,
 ('blue', 0): 492,
 ('skies', 0): 62,
 ('first', 0): 3648,
 ('time', 0): 13798,
 ('3', 0): 8006,
 ('weeks', 0): 2056,
 ('live', 0): 2645,
 ('desert', 0): 53,
 ('normal', 0): 402,
 ('weather', 0): 3121,
 ('bad', 0): 10939,
 ('storm', 0): 501,
 ('last', 0): 11206,
 ('night', 0): 10096,
 ('scared', 0): 1153,
 ('dog', 0): 1624,
 ('know', 1): 12928,
 ('still', 1): 7178,
 ('want', 1): 6202,
 ('coach', 1): 107,
 ('ron', 1): 53,
 ('jeremy', 1): 81,
 ('team', 1): 828,
 ('disney', 1): 358,
 ('win', 1): 1716,
 ('trophy', 1): 29,
 ('getting', 1): 5441,
 ('engaged', 1): 46,
 ('congrats', 1): 1687,
 ('way', 1): 6405,
 ('unles', 1): 1,
 ('joking', 1): 97,
 ('work', 1): 9811,
 ('super', 1): 1404,
 ('tired', 1): 2136,
 ('better', 1): 6329,
 ('mood', 1): 716,
 ('checked', 1): 255,
 ('bill', 1): 195,
 ('country', 1): 407,
 ('mart', 1): 38,
 ('tonight', 1): 6365,
 ('traitor', 1): 6,
 ('true', 1): 1647,
 ('good', 1): 31087,
 ('advice', 1): 352,
 ('day', 1): 23791,
 ('wish', 0): 11110,
 

In [258]:
## this is a helper method to get the target (word, label) frequency
def lookup(freqs,word,target):
    if frequencies.get((word, target)) != None:
        return frequencies.get((word, target))
    else:
        return 0

In [259]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior
        loglikelihood: the log likelihood of Naive bayes equation
    '''
    loglikelihood = {}
    logprior = 0

    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate number of positive and negative words
    N_pos = N_neg = 0
    for pair in freqs.keys():
        # if the label is positive 
        if pair[1] == 1:
            
            # Increment the number of positive words by the count for this (word, label) tuple
            N_pos += freqs[pair]

        # else, the label is negative
        else:

            # increment the number of negative words by the count for this (word,label) tuple
            N_neg += freqs[pair]


    # Calculate the number of documents
    D = len(train_y)

    # Calculate the number of positive documents 
    D_pos = (len(list(filter(lambda x: x == 1, train_y))))
    # Calculate the number of negative documents 
    D_neg = (len(list(filter(lambda x: x == 0, train_y))))
    # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = lookup(freqs,word,1)
        freq_neg = lookup(freqs,word,0)
        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood

In [260]:
logprior, loglikelihood = train_naive_bayes(frequencies, X_train, y_train)


## Test 

In [263]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''
    # process the tweet to get a list of words
    word_l = preprocess(tweet).split()

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    return p

In [264]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0 

    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0, then positive sentiment
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0, then negative sentiment
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.mean(np.absolute(y_hats-test_y))

    # Accuracy is 1 minus the error
    accuracy = 1-error
    return accuracy

In [265]:
test_naive_bayes(X_test, y_test, logprior, loglikelihood)

0.766095

## TODO

This was Multinomial Naive Bayes. 

* Now, please apply Bernoulli and Binary Naive Bayes. 
* Analyze their results in terms of accuracy, precision, recall and f1-score.