In [None]:
"""
1. Import twitter data
2. Split train, test
3. Process and clean the tweets
4. Calculate word freq per label
5. Calculate the log prior from the positive/negative balance of the training tweets
6. Calculate the loglikelihood of each word in the vocabulary
7. Calculate tweet sentiment = logprior + sum of the loglikelihoods of the words in the tweet

"""

### Import

In [None]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd

import re
import nltk
import string
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

from os import getcwd
import warnings
# Suppress all Python warnings from here on so cell output stays uncluttered.
warnings.filterwarnings('ignore')

# Fetch the two NLTK resources this notebook reads: the tweet corpus and the stop-word list.
nltk.download('twitter_samples')
nltk.download('stopwords')

### Get data

In [None]:
# Also let NLTK look for data in ../tmp relative to the current working directory.
# getcwd has to be *called*: interpolating the bare function object would embed
# its repr ("<built-in function getcwd>") in the path instead of a directory.
path=f'{getcwd()}/../tmp/'
nltk.data.path.append(path)

In [None]:
# Load the raw tweet texts of each sentiment class from the NLTK twitter_samples corpus.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

### Data prep

In [None]:
# Display how many positive and negative tweets were loaded (shown as a tuple: n_pos, '+', n_neg).
len(all_positive_tweets),'+',len(all_negative_tweets)

In [None]:
# Number of tweets per class used for training; whatever remains in each class
# is held out for testing. Named once here instead of repeating the literal.
TRAIN_SIZE_PER_CLASS = 4000

train_pos = all_positive_tweets[:TRAIN_SIZE_PER_CLASS]
test_pos = all_positive_tweets[TRAIN_SIZE_PER_CLASS:]
train_neg = all_negative_tweets[:TRAIN_SIZE_PER_CLASS]
test_neg = all_negative_tweets[TRAIN_SIZE_PER_CLASS:]

In [None]:
# Training set: positive tweets first, then negative ones; the label vector
# follows the same order (1.0 for each positive, 0.0 for each negative).
train_x = train_pos + train_neg
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])

In [None]:
# Held-out set, built the same way as the training set: positives then negatives,
# with labels 1.0 / 0.0 in matching order.
test_x = test_pos + test_neg
test_y = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])

### Process tweet

In [None]:
# Example tweet with a retweet marker, @handles, hashtags, an emoticon and a URL;
# passed to process_tweet below to eyeball the cleaning steps.
sample_tweet = 'RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np'

In [None]:
# Character class of emoji / pictograph code points stripped from tweets.
# Compiled once here instead of on every process_tweet call.
# NOTE(review): the U+24C2-U+1F251 range is very wide and also matches non-emoji
# text (e.g. CJK ideographs) -- confirm that stripping those is intended.
_EMOJI_RE = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    u"\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters and everything up to U+1F251
    u"\U0001f926-\U0001f937"  # supplemental pictographs (gestures)
    u"\U00010000-\U0010ffff"  # all supplementary planes
    u"\u2640-\u2642"          # gender signs
    u"\u2600-\u2B55"          # misc symbols .. arrows
    u"\u200d"                 # zero-width joiner
    u"\u23cf"                 # eject symbol
    u"\u23e9"                 # fast-forward symbol
    u"\u231a"                 # watch
    u"\ufe0f"                 # variation selector-16
    u"\u3030"                 # wavy dash
                      "]+", re.UNICODE)

# Filled on first use by _english_stopwords(); None means "not loaded yet".
_STOPWORDS_EN = None


def _english_stopwords():
    '''Return the NLTK English stop words as a frozenset, loading the corpus only once.'''
    global _STOPWORDS_EN
    if _STOPWORDS_EN is None:
        _STOPWORDS_EN = frozenset(stopwords.words('english'))
    return _STOPWORDS_EN


def process_tweet(tweet):
    
    '''
    Clean one raw tweet and return its list of stemmed tokens.

    Steps, in order:
    -remove the '#' sign (the hashtag word itself is kept)
    -remove stock market ticker symbols eg. $GE
    -remove hyperlinks
    -remove a leading retweet marker eg. "RT "
    -remove emoji / pictograph characters
    -tokenize (lower-cased, repeated characters shortened, @handles stripped)
    -remove stopwords
    -remove punctuation tokens
    -stemming

    tweet  - the raw tweet text (str)
    return - list of stemmed tokens (str), in tweet order
    '''
    tweet = re.sub(r'#','',tweet) #hashtag sign rm
    tweet = re.sub(r'\$\w*','',tweet) #ticker symbol rm
    tweet = re.sub(r'https?://[^\s\n\r]+','',tweet) #hyperlinks
    tweet = re.sub(r'^RT[\s]+','',tweet) #retweet symbol rm
    tweet = _EMOJI_RE.sub('', tweet)
    
    tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tokens = tknzr.tokenize(tweet)
    
    stemmer = PorterStemmer()
    stop_words = _english_stopwords()
    
    # string.punctuation is a str, so `in` is a substring test: single marks such
    # as "!" are dropped, while multi-character emoticons like ":)" (not a
    # substring of it) survive and remain available as sentiment features.
    return [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and token not in string.punctuation
    ]

In [None]:
# Sanity check: display the cleaned, stemmed tokens produced for the sample tweet.
process_tweet(sample_tweet)

### Create word - label frequency

In [163]:
def count_tweet(tweets,ys):
    '''
    Count how often each processed word occurs under each label.

    tweets - iterable of raw tweet strings
    ys     - iterable of labels, paired with tweets position by position
    output - dict mapping (word, y) -> freq
    '''
    counts = {}
    for text, sentiment in zip(tweets, ys):
        for token in process_tweet(text):
            key = (token, sentiment)
            # Start a pair at 0 the first time it is seen, then increment.
            counts[key] = counts.get(key, 0) + 1
    return counts

In [164]:
# Build the (word, label) -> count table from the training split.
freqs = count_tweet(train_x,train_y)

In [165]:
# Toy check of count_tweet: five short tweets, one labelled positive (1) and four negative (0).
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweet(tweets,ys)
# To list the training pairs by descending count instead:
# sorted(freqs.items(),key=lambda x:x[1],reverse=True)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [166]:
def train_naive_bayes(freqs,train_x,train_y):
    
    '''
    Fit a binary naive Bayes model from (word, label) counts.

    freqs   - dict mapping (word, label) -> count; label 1 = positive, 0 = negative
    train_x - training tweets; not used by the computation, kept so existing
              callers keep working
    train_y - sequence of labels (1 / 0), one per training tweet

    returns (logprior, loglikelihood)
    logprior      - log(D_pos/D) - log(D_neg/D)
    loglikelihood - dict mapping word -> log(p(W_pos)/p(W_neg))

    raises ValueError if train_y contains no positive or no negative label
    (the log prior would otherwise be undefined or infinite)

    D_pos = tot pos documents/tweet
    D_neg = tot neg documents
    D = tot documents

    p(W_pos)=(freq_w_pos+1)/(N_pos+V)
    p(W_neg)=(freq_w_neg+1)/(N_neg+V)
    freq_w_pos = freq of word as label 1
    freq_w_neg = freq of word as label 0
    N_pos = tot num of words as label 1
    N_neg = tot num of words as label 0
    V = tot num of unique words in all the documents
    '''
    
    D = len(train_y)
    D_pos = sum(1 for label in train_y if label == 1)
    D_neg = sum(1 for label in train_y if label == 0)
    
    # Without at least one example of each class the prior ratio is 0 or
    # infinite (or a division by zero when there are no labels at all).
    if D_pos == 0 or D_neg == 0:
        raise ValueError(
            'train_y must contain at least one positive (1) and one negative (0) '
            f'label; got {D_pos} positive and {D_neg} negative out of {D}'
        )
    
    logprior = np.log(D_pos/D)-np.log(D_neg/D)
    
    # Single pass over freqs: collect the vocabulary and the per-class word totals.
    vocab = set()
    N_pos = 0
    N_neg = 0
    for key, count in freqs.items():
        vocab.add(key[0])
        if key[1] == 1:
            N_pos += count
        elif key[1] == 0:
            N_neg += count
    V = len(vocab)
    
    loglikelihood = {}
    for word in vocab:
        freq_w_pos = freqs.get((word,1),0)
        freq_w_neg = freqs.get((word,0),0)
        
        # Add-one smoothing keeps both probabilities strictly positive, so the
        # ratio and its log are always finite.
        P_w_pos = (freq_w_pos+1)/(N_pos+V)
        P_w_neg = (freq_w_neg+1)/(N_neg+V)
        
        loglikelihood[word] = np.log(P_w_pos/P_w_neg)
    
    return logprior , loglikelihood

In [167]:
# Fit the model on the training split: one log prior plus a log-likelihood ratio per vocabulary word.
logprior,loglikelihood = train_naive_bayes(freqs,train_x,train_y)

In [168]:
def predict_naive_bayes(tweet,logprior,loglikelihood):
    '''
    Score one raw tweet with the trained model.

    tweet         - raw tweet text; it is cleaned with process_tweet first
    logprior      - log prior returned by train_naive_bayes
    loglikelihood - dict mapping word -> log-likelihood ratio

    returns logprior plus the log-likelihood of every token that appears in
    loglikelihood; tokens missing from the dict contribute nothing
    '''
    tokens = process_tweet(tweet)
    known_scores = (loglikelihood[tok] for tok in tokens if tok in loglikelihood)
    
    score = 0 + logprior
    for contribution in known_scores:
        score = score + contribution
    
    return score

In [172]:
# Score a single tweet; accuracy() below treats a score > 0 as positive and anything else as negative.
tweet='She cry.'
predict_naive_bayes(tweet,logprior,loglikelihood)

-1.6338810936509134

In [173]:
def accuracy(test_x,test_y,logprior,loglikelihood,predict_naive_bayes=predict_naive_bayes):
    '''
    Measure how often the classifier's label matches the true label.

    test_x              - iterable of raw tweets
    test_y              - labels (1 / 0) paired with test_x position by position
    logprior            - log prior passed through to the scoring function
    loglikelihood       - word -> log-likelihood dict passed through as well
    predict_naive_bayes - scoring function called as f(tweet, logprior, loglikelihood)

    returns 1 minus the mean absolute difference between predicted and true labels
    '''
    # A tweet is predicted positive (1) only when its score is strictly above 0.
    predictions = [
        1 if predict_naive_bayes(tweet, logprior, loglikelihood) > 0 else 0
        for tweet, _ in zip(test_x, test_y)
    ]
    
    mismatches = [abs(guess - truth) for guess, truth in zip(predictions, test_y)]
    
    return 1 - np.mean(mismatches)

In [174]:
# Evaluate on the held-out tweets (those after the first 4000 of each class).
accuracy(test_x,test_y,logprior,loglikelihood,predict_naive_bayes=predict_naive_bayes)

0.9955

In [175]:
# Another spot check: score a short tweet (a score that is not > 0 counts as negative in accuracy()).
tweet='you are bad'
predict_naive_bayes(tweet,logprior,loglikelihood)

-1.3237261653470738