In [1]:
import nltk 
from os import getcwd

In [2]:
# Download the two NLTK packages this notebook relies on: the tweet corpus
# (`twitter_samples`) and the `stopwords` list.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\abhas\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Register the sibling ../tmp2/ directory (resolved against the current
# working directory) as an extra location on NLTK's data search path.
filePath = getcwd() + "/../tmp2/"
nltk.data.path.append(filePath)

In [4]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples

from utils import *

In [5]:
# Read the raw tweet strings of both sentiment classes from the corpus,
# positive file first, negative file second.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(json_name)
    for json_name in ('positive_tweets.json', 'negative_tweets.json')
)

In [6]:
# Per class: the first 4000 tweets are used for training, everything after
# index 4000 is held out for testing.
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# Positives always come first, negatives second; the label vectors must be
# assembled in the same order.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [7]:
# Column vectors of labels: a block of ones for the positive tweets stacked
# on top of a block of zeros for the negative tweets.
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))),
    axis=0,
)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))),
    axis=0,
)

In [8]:
# Report the shapes of both label vectors.
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [9]:
# Build the frequency table from the training tweets and their labels, then
# report how many distinct keys it holds.
freqs = build_freqs(train_x, train_y)

print(f"length of frequency :{len(freqs.keys())}")

length of frequency :11340


In [10]:
# Show one raw training tweet (index 10) followed by the token list that
# process_tweet produces for it.
print('This is an example of a positive tweet: \n', train_x[10])
print('\nThis is an example of the processed version of the tweet: \n', process_tweet(train_x[10]))

This is an example of a positive tweet: 
 #FollowFriday @wncer1 @Defense_gouv for being top influencers in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'influenc', 'commun', 'week', ':)']


In [11]:
def extract_features(tweet, freq):
    '''
    Turn one tweet into the 3-element feature row used by the classifier.

    Input: 
        tweet: a string containing one raw tweet (it is tokenised here with process_tweet)
        freq: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3) laid out as
           [bias, sum of label-1 counts, sum of label-0 counts]
    '''
    # Tokenise the raw tweet; every token contributes once per occurrence.
    tokens = process_tweet(tweet)

    # Initialize
    x = np.zeros((1, 3))

    # bias term is set to 1
    x[0, 0] = 1

    for word in tokens:
        # Look the counts up in the dictionary that was passed in (not in a
        # notebook-level global); pairs that were never seen contribute 0.
        x[0, 1] += freq.get((word, 1), 0)
        x[0, 2] += freq.get((word, 0), 0)

    assert x.shape == (1, 3)

    return x

In [12]:
# Sanity check: compute and print the feature row of the first training tweet.
tmp1 = extract_features(train_x[0], freqs)
print(tmp1)

[[1.00e+00 3.02e+03 6.10e+01]]


In [16]:
# Design matrix: one row per training tweet, filled with the (1, 3) feature
# row that extract_features returns for it (its first entry is the bias).
X = np.zeros((len(train_x), 3))

for i in range(X.shape[0]):
    X[i] = extract_features(train_x[i], freqs)

# Training labels
Y = train_y

# Gradient descent, starting from an all-zero (3, 1) weight vector.
# NOTE(review): the last two arguments look like the learning rate and the
# number of iterations - confirm against gradientDescent in utils.
alpha = 1e-9
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), alpha, 1500)

print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(w, 8) for w in np.squeeze(theta)]}")

The cost after training is 0.24216477.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


In [17]:
def predict_tweet(tweet, freqs, theta):
    '''
    Score a single tweet with the trained weights.

    Input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output: 
        y_pred: sigmoid of the dot product between the tweet's feature row
                and theta (the model's score for the tweet)
    '''
    # Feature row for the tweet, projected onto the weights and squashed.
    features = extract_features(tweet, freqs)
    return sigmoid(features.dot(theta))

In [22]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    
    y_hat = []
    
    for tweet in test_x:
        y_pred = predict_tweet(tweet,freqs,theta)
        
        if y_pred > 0.6:
            y_hat.append(1)
        else:
            y_hat.append(0)
            
    accuracy = np.squeeze(np.sum(np.squeeze(np.asarray(y_hat)) == np.squeeze(test_y)))/len(y_hat)

  
    
    return accuracy

In [23]:
# Evaluate the trained weights on the held-out test tweets and print the
# accuracy with four decimals.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.8570


In [25]:
#colors

# Each helper prints its argument preceded by one space, wrapped between an
# ANSI SGR colour escape and the reset escape (code 00).

def prRed(skk):
    """Print ``skk`` using SGR code 91."""
    template = "\033[91m {}\033[00m"
    print(template.format(skk))


def prGreen(skk):
    """Print ``skk`` using SGR code 92."""
    template = "\033[92m {}\033[00m"
    print(template.format(skk))


def prYellow(skk):
    """Print ``skk`` using SGR code 93."""
    template = "\033[93m {}\033[00m"
    print(template.format(skk))


def prLightPurple(skk):
    """Print ``skk`` using SGR code 94."""
    template = "\033[94m {}\033[00m"
    print(template.format(skk))


def prPurple(skk):
    """Print ``skk`` using SGR code 95."""
    template = "\033[95m {}\033[00m"
    print(template.format(skk))


def prCyan(skk):
    """Print ``skk`` using SGR code 96."""
    template = "\033[96m {}\033[00m"
    print(template.format(skk))


def prLightGray(skk):
    """Print ``skk`` using SGR code 97."""
    template = "\033[97m {}\033[00m"
    print(template.format(skk))


def prBlack(skk):
    """Print ``skk`` using SGR code 98."""
    # NOTE(review): 98 is not one of the standard SGR foreground codes
    # (30-37, 90-97) - confirm which colour was intended here.
    template = "\033[98m {}\033[00m"
    print(template.format(skk))

In [27]:
# Misclassified tweets
#
# Print every test tweet whose thresholded prediction disagrees with its
# label: the raw tweet, its processed tokens, then "label<TAB>score<TAB>tokens".
# The 0.6 cut-off is the same one test_logistic_regression applies.

print('Label Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = predict_tweet(x, freqs, theta)

    # y is a length-1 row of test_y and y_hat a single-element array. Pull
    # plain Python floats out explicitly: letting '%d' / '%f' convert arrays
    # with ndim > 0 to scalars is deprecated in recent NumPy releases.
    label = float(np.squeeze(y))
    score = float(np.squeeze(y_hat))

    if label != float(score > 0.6):
        # Tokenise once and reuse the result for both output lines.
        tokens = process_tweet(x)
        prRed(('THE TWEET IS:', x))
        prGreen(('THE PROCESSED TWEET IS:', tokens))
        prYellow(('%d\t%0.8f\t%s' % (label, score, ' '.join(tokens).encode('ascii', 'ignore'))))

Label Predicted Tweet
[91m ('THE TWEET IS:', 'Happy Friday :-) http://t.co/iymPIlWXFY')[00m
[92m ('THE PROCESSED TWEET IS:', ['happi', 'friday', ':-)'])[00m
[93m 1	0.59905059	b'happi friday :-)'[00m
[91m ('THE TWEET IS:', 'My #TeenChoice For #ChoiceinternationalArtist is #SuperJunior Fighting Oppa :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['teenchoic', 'choiceinternationalartist', 'superjunior', 'fight', 'oppa', ':D'])[00m
[93m 1	0.56587068	b'teenchoic choiceinternationalartist superjunior fight oppa :D'[00m
[91m ('THE TWEET IS:', "@FindBenNeedham it's my birthday today so for my birthday wish I hope there's good news about Ben soon :-)")[00m
[92m ('THE PROCESSED TWEET IS:', ['birthday', 'today', 'birthday', 'wish', 'hope', "there'", 'good', 'news', 'ben', 'soon', ':-)'])[00m
[93m 1	0.58742115	b"birthday today birthday wish hope there' good news ben soon :-)"[00m
[91m ('THE TWEET IS:', "Good morning all :-)\n\nIt's Friday!!!!!! \U000fec00\n\nWhat are your plans for t

[93m 1	0.52095969	b"i'v type wors thing good :p"[00m
[91m ('THE TWEET IS:', 'Beat Da Beat sits well with @slashgear (includes new video) :-) http://t.co/zJmOmpx7iv #mobilegame #ios8 #Android http://t.co/LY7EnuUH8z')[00m
[92m ('THE PROCESSED TWEET IS:', ['beat', 'da', 'beat', 'sit', 'well', 'includ', 'new', 'video', ':-)'])[00m
[93m 1	0.58362536	b'beat da beat sit well includ new video :-)'[00m
[91m ('THE TWEET IS:', "I'm playing Brain Dots : ) #BrainDots\nhttp://t.co/UGQzOx0huu")[00m
[92m ('THE PROCESSED TWEET IS:', ["i'm", 'play', 'brain', 'dot', 'braindot'])[00m
[93m 1	0.48370676	b"i'm play brain dot braindot"[00m
[91m ('THE TWEET IS:', 'goodnight guys :-) \nremember tomorrow is a brand new day, a fresh start and another chance')[00m
[92m ('THE PROCESSED TWEET IS:', ['goodnight', 'guy', ':-)', 'rememb', 'tomorrow', 'brand', 'new', 'day', 'fresh', 'start', 'anoth', 'chanc'])[00m
[93m 1	0.59098476	b'goodnight guy :-) rememb tomorrow brand new day fresh start anoth ch

[91m ('THE TWEET IS:', 'Cya after a week!! :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['cya', 'week', ':D'])[00m
[93m 1	0.56771389	b'cya week :D'[00m
[91m ('THE TWEET IS:', '@IstanbulPHP nice one :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['nice', 'one', ':D'])[00m
[93m 1	0.56717471	b'nice one :D'[00m
[91m ('THE TWEET IS:', "@heyimbunny_ in Australia it's morning for them :D")[00m
[92m ('THE PROCESSED TWEET IS:', ['australia', 'morn', ':D'])[00m
[93m 1	0.57070881	b'australia morn :D'[00m
[91m ('THE TWEET IS:', '@mariammaslouhi If it HAD been two ~17 year olds deeply in love? :-)))')[00m
[92m ('THE PROCESSED TWEET IS:', ['two', '17', 'year', 'old', 'deepli', 'love', ':-)'])[00m
[93m 1	0.59568249	b'two 17 year old deepli love :-)'[00m
[91m ('THE TWEET IS:', 'hah....and a thousand more lies  :D https://t.co/QEil0C0auo')[00m
[92m ('THE PROCESSED TWEET IS:', ['hah', '...', 'thousand', 'lie', ':D'])[00m
[93m 1	0.55528926	b'hah ... thousand lie :D'[00m
[91m ('THE 

[91m ('THE TWEET IS:', "@neiltyson What are your thought's on Kepler 452b? :D")[00m
[92m ('THE PROCESSED TWEET IS:', ["thought'", 'kepler', '452b', ':D'])[00m
[93m 1	0.56575923	b"thought' kepler 452b :D"[00m
[91m ('THE TWEET IS:', '@1RobBeasley any progress in Stones transfer to Chelsea ?? :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['progress', 'stone', 'transfer', 'chelsea', ':D'])[00m
[93m 1	0.56456205	b'progress stone transfer chelsea :D'[00m
[91m ('THE TWEET IS:', 'You always be part of me, I am part of you and defenitely...♬ @MOHDBINTANG :p')[00m
[92m ('THE PROCESSED TWEET IS:', ['alway', 'part', 'part', 'defenit', '...', '♬', ':p'])[00m
[93m 1	0.50446731	b'alway part part defenit ...  :p'[00m
[91m ('THE TWEET IS:', '@martymccarthy1 @ABCRural A great incentive to get kids to eat their Fruit &amp; Veges :-)')[00m
[92m ('THE PROCESSED TWEET IS:', ['great', 'incent', 'get', 'kid', 'eat', 'fruit', 'vege', ':-)'])[00m
[93m 1	0.57933064	b'great incent get kid eat f

[91m ('THE TWEET IS:', '@NotJagath are you a member of හෙල හවුල by any chance? :D @Chevindu')[00m
[92m ('THE PROCESSED TWEET IS:', ['member', 'හෙල', 'හවුල', 'chanc', ':D'])[00m
[93m 1	0.56431794	b'member   chanc :D'[00m
[91m ('THE TWEET IS:', 'Yo Southpaw was a GREAT movie someone better be getting an award for it :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['yo', 'southpaw', 'great', 'movi', 'someon', 'better', 'get', 'award', ':D'])[00m
[93m 1	0.57600947	b'yo southpaw great movi someon better get award :D'[00m
[91m ('THE TWEET IS:', 'Last classes this morning before two week break! :-)')[00m
[92m ('THE PROCESSED TWEET IS:', ['last', 'class', 'morn', 'two', 'week', 'break', ':-)'])[00m
[93m 1	0.57631094	b'last class morn two week break :-)'[00m
[91m ('THE TWEET IS:', '@hahahakumakichi Yeah! The bird transforms into a sword. :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['yeah', 'bird', 'transform', 'sword', ':D'])[00m
[93m 1	0.56405841	b'yeah bird transform sword :D'

[91m ('THE TWEET IS:', 'Match day Bitchessss !!!\n\nReal Madrid vs Man Shitty :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['match', 'day', 'bitchesss', 'real', 'madrid', 'vs', 'man', 'shitti', ':D'])[00m
[93m 1	0.57099448	b'match day bitchesss real madrid vs man shitti :D'[00m
[91m ('THE TWEET IS:', 'at first I did love you, but now I just wanna fuck, late night thinking of you until I got a nut :-) :v\n\n"look... http://t.co/8YhLcb16Lf')[00m
[92m ('THE PROCESSED TWEET IS:', ['first', 'love', 'wanna', 'fuck', 'late', 'night', 'think', 'got', 'nut', ':-)', 'v', 'look', '...'])[00m
[93m 1	0.57477244	b'first love wanna fuck late night think got nut :-) v look ...'[00m
[91m ('THE TWEET IS:', '@joiredve follback :D')[00m
[92m ('THE PROCESSED TWEET IS:', ['follback', ':D'])[00m
[93m 1	0.56703820	b'follback :D'[00m
[91m ('THE TWEET IS:', "@MCunleashed :D I can't sleep until I need to. If I try I just lay in bed bored")[00m
[92m ('THE PROCESSED TWEET IS:', [':D', "can't", 's

[91m ('THE TWEET IS:', '@hswift65 @roseofthesealee PS I took 80-1 so I am hopeful on all counts :-)')[00m
[92m ('THE PROCESSED TWEET IS:', ['ps', 'took', '80-1', 'hope', 'count', ':-)'])[00m
[93m 1	0.57213368	b'ps took 80-1 hope count :-)'[00m
[91m ('THE TWEET IS:', "all i've done today is watch law &amp; order: svu. i love being sick :-)))")[00m
[92m ('THE PROCESSED TWEET IS:', ["i'v", 'done', 'today', 'watch', 'law', 'order', 'svu', 'love', 'sick', ':-)'])[00m
[93m 1	0.58404379	b"i'v done today watch law order svu love sick :-)"[00m
[91m ('THE TWEET IS:', '@BLACKTOGXLD my pleasure :D enjoy your day!!')[00m
[92m ('THE PROCESSED TWEET IS:', ['pleasur', ':D', 'enjoy', 'day'])[00m
[93m 1	0.58097984	b'pleasur :D enjoy day'[00m
[91m ('THE TWEET IS:', 'As the morning wears on, its getting darker. Is it end of world day today instead of September? :-)')[00m
[92m ('THE PROCESSED TWEET IS:', ['morn', 'wear', 'get', 'darker', 'end', 'world', 'day', 'today', 'instead', 'sept

In [28]:
# Try the trained model on a hand-written tweet: show its tokens, its score,
# and the sentiment implied by a 0.5 cut-off.
my_tweet = 'This was one of the worst experience that I have encountered. Iam filled with melancholy'
print(process_tweet(my_tweet))

y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)

print('Positive sentiment' if y_hat > 0.5 else 'Negative sentiment')

['one', 'worst', 'experi', 'encount', 'iam', 'fill', 'melancholi']
[[0.49503899]]
Negative sentiment
