In [22]:
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from collections import defaultdict
from nltk.stem import PorterStemmer

In [17]:
filePath = f"{getcwd()}/data/"
nltk.data.path.append(filePath)

## read tweets

In [18]:

# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# avoid assumptions about the length of all_positive_tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [19]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hezardastan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/hezardastan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/hezardastan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hezardastan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw to /home/hezardastan/nltk_data...
[nltk_data]   Unzipping corpora/omw.zip.


True

In [23]:
# tags needed for lemmatization
tag_dict = defaultdict(lambda : wordnet.NOUN)
tag_dict['J'] = wordnet.ADJ
tag_dict['V'] = wordnet.VERB
tag_dict['R'] = wordnet.ADV

def process_tweet(text):
    
    tweet = text.lower()
    tweet = re.sub(r'[0-9]+', '', tweet)
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    tweet = tweet.translate((None, string.punctuation))
    
  
    #tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    stopwords_english = stopwords.words('english')
    tweet = [word for word in tweet_tokens if word not in stopwords_english]
    stemmer = PorterStemmer()
    tweets_clean = []
    
    lemmatizer=WordNetLemmatizer()
    lemmatize_text = []
    for token, tag in pos_tag(tweet):
        word = lemmatizer.lemmatize(token, tag_dict[tag[0]])
        stem_word = stemmer.stem(word)  # stemming word
        if(len(stem_word)>1):
            tweets_clean.append(stem_word)
    
    return tweets_clean

In [24]:

custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['rt', 'hello', 'great', 'day', ':)', 'good', 'morn']


In [26]:
def count_tweets(result, tweets, ys):

    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word,y)

            if pair in result:
                result[pair] += 1

            else:
                result[pair] = 1

    return result

In [27]:
freqs = count_tweets({}, train_x, train_y)

## count unique words

In [28]:
vocab = set([pair[0] for pair in freqs.keys()])# some words are in both negative an positive sentences
V = len(freqs)

## number of all positive word and negative word with repetition

In [34]:
# calculate N_pos, N_neg, V_pos, V_neg
N_pos = N_neg = V_pos = V_neg = 0
for pair in freqs.keys():
    # if the label is positive (greater than zero)
    if pair[1] > 0:
        # increment the count of unique positive words by 1
        V_pos += 1

        # Increment the number of positive words by the count for this (word, label) pair
        N_pos += freqs[pair]

    # else, the label is negative
    else:
        # increment the count of unique negative words by 1
        V_neg += 1

        # increment the number of negative words by the count for this (word,label) pair
        N_neg += freqs[pair]

## compute log prior. number of positive sentence on number of negative one

In [42]:
#Calculate D, the number of documents
D = len(train_y)

# Calculate D_pos, the number of positive documents
D_pos = (len(list(filter(lambda x: x > 0, train_y))))

# Calculate D_neg, the number of negative documents
D_neg = (len(list(filter(lambda x: x <= 0, train_y))))

# Calculate logprior
logprior = np.log(D_pos) - np.log(D_neg)

## compute logliklihood

In [35]:
def lookup(freqs, word, label):
    n = 0
    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [37]:
loglikelihood = {}
for word in vocab:
    # get the positive and negative frequency of the word
    freq_pos = lookup(freqs,word,1)
    freq_neg = lookup(freqs,word,0)

    # calculate the probability that each word is positive, and negative
    p_w_pos = (freq_pos + 1) / (N_pos + V)
    p_w_neg = (freq_neg + 1) / (N_neg + V)

    # calculate the log likelihood of the word
    loglikelihood[word] = np.log(p_w_pos/p_w_neg)

In [38]:
loglikelihood

{'drainag': 0.6920596631988825,
 'dept': -0.0010875173610629006,
 'dh': -1.0996998060291727,
 'sunflow': 1.0975247713070468,
 'working-on-a-tight-schedul': -0.6942346979210082,
 'rogerwatch': -0.6942346979210082,
 'kha': 0.6920596631988825,
 'intoler': -0.6942346979210082,
 'rodfanta': -0.6942346979210082,
 'cafe': -1.0996998060291727,
 'custard': 0.6920596631988825,
 'comput': -0.1834090741550175,
 'aisyah': -0.6942346979210082,
 'reassur': 0.6920596631988825,
 'philippin': 1.0975247713070468,
 'suav': 0.6920596631988825,
 'mesh': -0.6942346979210082,
 'gyllenha': -0.6942346979210082,
 'age': -1.0996998060291725,
 'aja': 0.6920596631988825,
 'cheerio': 0.6920596631988825,
 'compens': 0.6920596631988825,
 'wi̇ll': -3.584606455817173,
 'away': -0.6516750835022123,
 'rubber': 1.3852068437588276,
 'fucjikg': -0.6942346979210082,
 'ukrain': 0.6920596631988825,
 'cousin': -0.0010875173610629006,
 'eu': -0.6942346979210082,
 'awami': -0.6942346979210082,
 'protein': -0.6942346979210082,
 'بن

## prediction

In [39]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = process_tweet(tweet)
    p = 0
    p += logprior
    for word in word_l:
        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    return p

In [40]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):

    accuracy = 0  # return this properly
    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0

        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.mean(np.absolute(y_hats-test_y))

    # Accuracy is 1 minus the error
    accuracy = 1-error


    return accuracy


In [43]:

print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9935
