In [1]:
# import libraries

import random
import pandas as pd
import numpy as np
import pickle
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read dataset

raw_data = pd.read_csv("A2_data.csv")
raw_data.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...


In [3]:
# check count of each labels in dataset
raw_data.LABEL.value_counts()

1    2287
0    2000
Name: LABEL, dtype: int64

### The data is a bit skewed towards the positive class

## Preprocessing

In [4]:
# import libraries

import nltk
from nltk.stem.snowball import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from autocorrect import Speller


# create object for lemmetizer and spelling checking

lemmatizer = WordNetLemmatizer()
spell = Speller(lang='en')

def preprocess(df):
    
    # create required columns and initialize with 0
    
    df['white_space_removed'] = 0
    df['tokenized_data'] = 0
    df['stopword_removed_data'] = 0
    df['punct_removed_data'] = 0
    df['url_removed_data'] = 0
    df['spelling_checked_data'] = 0
    df['lemmetized_data'] = 0
    df['preprocessed_txt'] = 0
    
    # iterate over each row of dataset and preprocess data
    sentences
    for i in range(df.shape[0]):

        # white space removel
        
        df['white_space_removed'][i] = re.sub("\s+", " ", df.TEXT[i])


        # tokenization
        
        lower = df['white_space_removed'][i].lower()
        tokenized_data = word_tokenize(lower)
        df['tokenized_data'][i] = tokenized_data
        
        

        # remove stopwords
        
        stop_words = "|".join(stopwords.words('english'))
        pattern = re.compile(r'\b(' + stop_words + r')\b\s*')
        stopword_removed_data = [pattern.sub("", text) for text in tokenized_data]
        stopword_removed_data = [x for x in stopword_removed_data if x]
        
        df['stopword_removed_data'][i] = stopword_removed_data         
       

        # punctuation removel
        
        punct_removed_data = [x for x in stopword_removed_data if x.isalnum()]
        df['punct_removed_data'][i] = punct_removed_data



        # remove urls and html tags
        
        urls = re.findall("https?://[a-zA-Z0-9_\?=\@\/#=.~-]+", " ".join(punct_removed_data))
        url_removed_data = [x for x in punct_removed_data if x not in urls]
        df['url_removed_data'][i] = url_removed_data
 


        # spelling checking
    
        spelling_checked_data = [spell(x) for x in url_removed_data]
        df['spelling_checked_data'][i] = spelling_checked_data


        # lemmetization
        
        lemmas = []
        for w in spelling_checked_data:
            lemmas.append(lemmatizer.lemmatize(w, wordnet.VERB))
            df['lemmetized_data'][i] = lemmas
        
        if(str(df['lemmetized_data'][i]) == '0'):
            df.drop(i)
        else:
            df['preprocessed_txt'][i] = " ".join(df['lemmetized_data'][i])
            
        x = df['preprocessed_txt'][i]
        if(type(x)!='str'):
            x=str(x)
        x = '<s> ' + x + ' </s>'
        df['preprocessed_txt'][i]=x    
        
    print("PREPROCESSING PIPELINE AT A GLANCE")    
    display(df.head())
    df = df[['LABEL','DATE_TIME','preprocessed_txt']]
                
    return df

In [6]:
raw_data_copy_processed = preprocess(raw_data.copy())
print("PREPROCESSED DATA")
display(raw_data_copy_processed.head())

In [None]:
raw_data_copy_processed.to_csv('A2_dataset_processed.csv',encoding='utf-8-sig', index=False)

## Q1- laplace smoothing  transform

In [5]:
# read preprocessed data
processed_data = pd.read_csv('A2_dataset_processed.csv')


#count unigram frequencies and prepare vocabulary set
vocab = set()
unigram_count = {}
    

for x in processed_data['preprocessed_txt'].to_list():    
    for y in x.split():
        vocab.add(y)
        if y in unigram_count:
            unigram_count[y] += 1
        else:
            unigram_count[y] = 1

vocab = list(vocab)
print("length of vocabulary :", len(vocab))

length of vocabulary : 7349


In [6]:
#count bigram frequencies of only existing bigrams; rest are trivially 0
bigram_count={}      
            
            
for line in processed_data.preprocessed_txt.to_list():
            list_words = line.split()
            for k in range(len(list_words)-1):
                i = list_words[k]
                j = list_words[k+1]
                if(i,j) in bigram_count:
                    bigram_count[(i,j)] += 1
                else:
                    bigram_count[(i,j)] = 1                
                
list(bigram_count)[:5]

[('<s>', 'get'),
 ('get', 'thread'),
 ('thread', 'scar'),
 ('scar', '</s>'),
 ('<s>', 'awaisnaseer')]

In [None]:
# calculate updated bigram probability using laplace transform over entire dataset and print top 4 
v = vocab
vocab_size = len(vocab)
lap_bigram_probs = {}

for i in v:
    for j in v:
        if(i,j) in bigram_count:
            lap_bigram_probs[(i,j)] = (bigram_count[(i,j)] + 1)/(unigram_count[i] + vocab_size) 
        else:
            lap_bigram_probs[(i,j)] = 1/(unigram_count[i] + vocab_size)
            
with open('lap_bigram_probs.pickle', 'wb') as handle:
    pickle.dump(lap_bigram_probs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
del lap_bigram_probs

In [7]:
# read laplace probability of bigram from pickle 
with open('lap_bigram_probs.pickle', 'rb') as handle:
    lap_bigram_probs = pickle.load(handle)   

# print top 4 bigrams

In [8]:
from collections import Counter

print("top 4 bigrams")
for (k, v) in Counter(lap_bigram_probs).most_common(4):            
    print((k, v))

top 4 bigrams
(('http', '</s>'), 0.014073287307488051)
(('lol', '</s>'), 0.008906021533962515)
(('gon', 'na'), 0.00823322985558105)
(('day', '</s>'), 0.006479767257339328)


## Next word generation

In [9]:
# generate most probable next word after 'context'
def next_word(context):   
    r = random.random()
    map_to_probs = {}
    
    # prob. of occurrence of each word 'token' after 'context'
    for token in vocab:
        map_to_probs[token] = lap_bigram_probs[(context, token)]    
    
    # summ stores cumulative probabilities of occurrence of a word after a 'context' 
    summ = 0  
    for token in (map_to_probs):
        summ += map_to_probs[token] 
        if (summ > r):
            return token

## Sentance generation

In [10]:
# generate sentences
def generate_text(): 
        minn = 7
        maxx = 20
        context_queue = '<s>' #current last word seen/generated; sent as 'context' to next_word() function
        result = ['<s>']  #entire sentence generated upto present time 
        
        c = 1
        while  c <= maxx :
            obj = next_word(context_queue)
            if obj == '</s>' and c <= minn:
                continue
            elif obj == '</s>' and c > minn:
                break
            else: 
                context_queue = obj
                result.append(obj)
                c += 1
                            
        result.append('</s>')
        return ' '.join(result)
    
generate_text()

'<s> tashadhanraj lathe teddy underwood wake mat dun confessing7girl julibarcelona cigarettes johngreenaway drunken uch jonbecker infinity corner satisfy store presentations wise </s>'

## genrate 500 sentences using smoothed probabilities

In [11]:
sentences_500_no_beta = []
for i in range(500):
    sentences_500_no_beta.append(generate_text())

#  print first 5 generated sentences
for s in sentences_500_no_beta[:5]:
    print(s)
    
del lap_bigram_probs

<s> good severe marleematlin kreesha mind night nj deeper toast list geog yasmimmm cameronreilly excitement backlog lord 3rd 1week suck harper </s>
<s> chopsuey2e zhang political fabuleuxdestin afterwards bad jeremy poke sowwiiiee tilde chick r2e2 yan dif super nareejo semuuaa afternoon pc uugggh </s>
<s> mosquito 4am lobster top vernongarrett calgary greatfitness knowwwwww mc insure shogi 5th benny doingwork l8ly popularity normally ncaa tweak costa </s>
<s> okay dunkndisorderly tutor mommapuff souvenir sketchbook mwahahaha financial goodnight jenny hoopinispassion howliet sudden kea34 sri join60seconds offline shaft secular matthewsheppard </s>
<s> lucypope hoaaaaaaaaaaaaa caffeine mark liturgy wheel ceiling toronto dow werewolfseth m0t0breath h0area getknifed yourproxycomm product 45 affairs solar funky tent </s>


In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


scores = []
mod_scores = []
labels = []


def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    
    sentiment_dict = sid_obj.polarity_scores(sentence) 
    scores.append(sentiment_dict['compound'])
    mod_scores.append(abs(sentiment_dict['compound']))
    
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] > 0 :
        labels.append(1)
    else:
        labels.append(0)
        
        
for s in sentences_500_no_beta:
    sentiment_scores(s) 

## Save generated sentances

In [13]:
len(labels)

500

In [14]:
df_no_beta = pd.DataFrame({'sentences': sentences_500_no_beta, 'labels': labels, 'vader_scores': scores, 'mod_vader_scores': mod_scores})
display(df_no_beta.head(5))
df_no_beta.to_csv('df_no_beta.csv', index=False)

Unnamed: 0,sentences,labels,vader_scores,mod_vader_scores
0,<s> good severe marleematlin kreesha mind nigh...,1,0.1531,0.1531
1,<s> chopsuey2e zhang political fabuleuxdestin ...,1,0.1027,0.1027
2,<s> mosquito 4am lobster top vernongarrett cal...,1,0.5994,0.5994
3,<s> okay dunkndisorderly tutor mommapuff souve...,1,0.1027,0.1027
4,<s> lucypope hoaaaaaaaaaaaaa caffeine mark lit...,0,-0.1027,0.1027


In [15]:
avg_vader_score_no_beta = sum(mod_scores) / len(mod_scores)
print("average vader score without beta : ", avg_vader_score_no_beta)

average vader score without beta :  0.3616596


In [16]:
temp = df_no_beta.loc[df_no_beta.labels == 1]
temp.shape

(223, 4)

## Bigram with Beta computation for positive sentences

In [17]:
# unigram counts:  for datasets with only positive labels

vocab1 = set()
unigram_count1={}


for x in processed_data['preprocessed_txt'][processed_data.LABEL == 1].to_list():    
    for y in x.split():
        vocab1.add(y)
        if y in unigram_count1:
            unigram_count1[y] += 1
        else:
            unigram_count1[y] = 1

            
print("length of postive vocab is :", len(vocab1))
vocab1 = list(vocab1)
processed_data[processed_data.LABEL == 1].head()

length of postive vocab is : 5039


Unnamed: 0,LABEL,DATE_TIME,preprocessed_txt
1,1,Thu May 14 10:13:55 2009,<s> awaisnaseer like sedan mango one yesterday...
2,1,Fri Jun 05 21:02:20 2009,<s> work car work show go bed sooooooooooo tir...
3,1,Sun Jun 14 22:25:52 2009,<s> drama actually start afternoon try somethi...
4,1,Sun May 31 00:42:12 2009,<s> falcon601 www vote col love much </s>
5,1,Sun May 17 03:26:30 2009,<s> mrstessyman ever good day love knitpicks </s>


In [18]:
# count bigram frequencies of only existing bigrams for positive label sentences; rest are trivially 0
bigram_count1 = {}      
            
            
for line in processed_data[processed_data.LABEL == 1].preprocessed_txt.to_list():
            list_words = line.split()
            for k in range(len(list_words)-1):
                i = list_words[k]
                j = list_words[k+1]
                if(i,j) in bigram_count1:
                    bigram_count1[(i,j)] += 1
                else:
                    bigram_count1[(i,j)] = 1                
                
list(bigram_count1.items())[:5]

[(('<s>', 'awaisnaseer'), 1),
 (('awaisnaseer', 'like'), 1),
 (('like', 'sedan'), 1),
 (('sedan', 'mango'), 1),
 (('mango', 'one'), 1)]

In [54]:
#calculate updated bigram probability using laplace transform over entire dataset
length = len(vocab1)
p_bigrams_lap1={}


for i in vocab1:
    for j in vocab1:
        if (i,j) in bigram_count1:
            p_bigrams_lap1[(i,j)] = (bigram_count1[(i,j)]+1) / (unigram_count1[i] + length) 
        else:
            p_bigrams_lap1[(i,j)] = 1/(unigram_count1[i]+length)
            
list(p_bigrams_lap1.items())[:5]

[(('eqfacebookgrp', 'eqfacebookgrp'), 0.0001984126984126984),
 (('eqfacebookgrp', 'computer'), 0.0001984126984126984),
 (('eqfacebookgrp', 'noone'), 0.0001984126984126984),
 (('eqfacebookgrp', 'essay'), 0.0001984126984126984),
 (('eqfacebookgrp', 'ncaa'), 0.0001984126984126984)]

In [55]:
with open('p_bigrams_lap1.pickle', 'wb') as handle:
    pickle.dump(p_bigrams_lap1, handle, protocol=pickle.HIGHEST_PROTOCOL)

del p_bigrams_lap1

with open('p_bigrams_lap1.pickle', 'rb') as handle:
    p_bigrams_lap1 = pickle.load(handle)

In [64]:
# calculating smoothed bigram prob including beta
p_bigrams_beta1 = {}


for i in vocab:
    for j in vocab:
        if (i,j) not in bigram_count and (i,j) not in bigram_count1 and i not in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((0 + 1 + 5*0) / (unigram_count[i] + length + 5*0))
        elif (i,j) in bigram_count and (i,j) not in bigram_count1 and i not in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((bigram_count[(i,j)] + 1 + 5*0) / 
                                     (unigram_count[i] + length + 5*0))      
        elif (i,j) not in bigram_count and (i,j) not in bigram_count1 and i in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((0 + 1 + 5*0) / 
                                     (unigram_count[i] + length + 5*unigram_count1[i]))
        elif (i,j) in bigram_count and (i,j) not in bigram_count1 and i in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((bigram_count[(i,j)] + 1 + 5*0) / 
                                     (unigram_count[i] + length + 5*unigram_count1[i]))
        elif (i,j) in bigram_count and (i,j) in bigram_count1 and i in unigram_count1:
            p_bigrams_beta1[(i,j)] = ((bigram_count[(i,j)] + 1 + 5*bigram_count1[(i,j)]) / 
                                     (unigram_count[i] + length + 5*unigram_count1[i]))

In [21]:
with open('p_bigrams_beta1.pickle', 'wb') as handle:
    pickle.dump(p_bigrams_beta1, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
del p_bigrams_beta1

# # for i in range(10):
# #     print(i)

with open('p_bigrams_beta1.pickle', 'rb') as handle:
    p_bigrams_beta1 = pickle.load(handle)

## Positive sentences generation

In [65]:
def next_word(context):#generate most probable next word after 'context'
    r=random.random()
    map_to_probs = {}
    
    for token in vocab:
        map_to_probs[token] = p_bigrams_beta1[(context, token)] #prob. of occurrence of each word 'token' after 'context'

    summ = 0  #summ stores cumulative probabilities of occurrence of a word after a 'context' 
    for token in (map_to_probs):
        summ += map_to_probs[token] 
        if (summ > r):
            return token


def generate_text(): #returns 1 generated sentence

        minn=7
        maxx=20
        context_queue = '<s>' #current last word seen/generated; sent as 'context' to next_word() function
        result = ['<s>']  #entire sentence generated upto present time 
        
        c = 1
        while  c<=maxx :
            obj = next_word(context_queue)
            if obj == '</s>' and c <= minn:
                continue
            elif obj == '</s>' and c > minn:
                break
            else: 
                context_queue=obj
                result.append(obj)
                c += 1
                            
        result.append('</s>')
        return ' '.join(result)
    

#genrate 500 sentences using smoothed probabilities with beta positive
sentences_beta1 = []
for i in range(500):
    sentences_beta1.append(generate_text())

In [70]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

scores_beta1 = []
mod_scores_beta1 = []
labels_beta1 = []


def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    
    sentiment_dict = sid_obj.polarity_scores(sentence) 
    scores_beta1.append(sentiment_dict['compound'])
    mod_scores_beta1.append(abs(sentiment_dict['compound']))
    
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0 :
        labels_beta1.append(1)
    else:
        labels_beta1.append(0)
        
        
for s in sentences_beta1:
    sentiment_scores(s)

## print 5 positive  sentences

In [71]:
for s in sentences_beta1[:5]:
    print(s)
    print()

<s> sometimes psp 14 kudos therealpickler handyman 1st whip strobelight sprinters wickets mya flint launch emilyybrowningg lisaworld kingdom domingo roxannegregorio doreenatdms </s>

<s> know transformers conscious italian natmcb78 chesterfield aurea officials cig muster brightside jajjaja friend elon drew123 cream mrstessyman mga springleaf anoopdoggdesai </s>

<s> festivallights ijustmightendit grizzly flight eu myinnersexygirl rotten demivenom ahmed nikkithebee cleftmommy0217 indrairwan jk boyhood êµ msrnbjazz picture lad oxo 6 </s>

<s> diversitybgt ruthramirez diversitybgt hometown littlebites bonnaroo shrew fact academia knackered ergo clever uu tomsmithmcse nin morrison sl rt announcements alas </s>

<s> oh gallery sport grey district factory yeh american disneys pilot apply ì le cheese stem burn honorary realdeal32 shallow section </s>



In [72]:
df_beta1 = pd.DataFrame({'sentences with beta positive': sentences_beta1, 'labels': labels_beta1, 'vader_scores': scores_beta1, 'mod_vader_scores': mod_scores_beta1})
display(df_beta1.head(5))

df_beta1.to_csv('df_beta1')

df_beta1 = pd.read_csv('df_beta1')
mod_scores_beta1 = df_beta1.mod_vader_scores

del p_bigrams_beta1

Unnamed: 0,sentences with beta positive,labels,vader_scores,mod_vader_scores
0,<s> sometimes psp 14 kudos therealpickler hand...,1,0.5106,0.5106
1,<s> know transformers conscious italian natmcb...,1,0.4939,0.4939
2,<s> festivallights ijustmightendit grizzly fli...,0,-0.34,0.34
3,<s> diversitybgt ruthramirez diversitybgt home...,1,0.0,0.0
4,<s> oh gallery sport grey district factory yeh...,1,0.3818,0.3818


average vader score for positive sentences :  0.38994379999999984


In [73]:
avg_vader_score_beta1 = sum(mod_scores_beta1) / len(mod_scores_beta1)
print("average vader score for positive sentences : ", avg_vader_score_beta1)

average vader score for positive sentences :  0.38994379999999984


## Negative sentance generation

In [76]:
df2 = processed_data.loc[processed_data.LABEL == 0]
display(df2.head())

Unnamed: 0,LABEL,DATE_TIME,preprocessed_txt
0,0,Fri Jun 05 14:26:50 2009,<s> get thread scar </s>
9,0,Wed Jun 17 09:18:19 2009,<s> need shake gloomy feel maybe rain </s>
10,0,Mon Jun 22 13:51:56 2009,<s> minecraft ride sarah still afraid ride any...
12,0,Fri May 22 00:37:02 2009,<s> sokendrakouture yea alone </s>
18,0,Thu May 21 23:50:48 2009,<s> flyingbolt good without </s>


## Bigram with beta computation for negative sentences

In [78]:
# unigram counts for negative sentences 

v2 = set()
unigram_count2 = {}

for x in df2['preprocessed_txt'].to_list():
    for y in x.split():
        v2.add(y)
        if y in unigram_count2:
            unigram_count2[y]+=1
        else:
            unigram_count2[y]=1

print("length of vocabuary in negative sentances : ", len(v2))
v2 = list(v2)

length of vocabuary in negative sentances :  4217


In [80]:
#count bigram frequencies of only existing bigrams for negative label sentences

bigram_count2 = {}
            
for line in df2.preprocessed_txt.to_list():
            list_words = line.split()
            for k in range(len(list_words)-1):
                i=list_words[k]
                j=list_words[k+1]
                if(i,j) in bigram_count2:
                    bigram_count2[(i,j)]+=1
                else:
                    bigram_count2[(i,j)]=1                
                
print(list(bigram_count2.items())[:5])

[(('<s>', 'get'), 28), (('get', 'thread'), 1), (('thread', 'scar'), 1), (('scar', '</s>'), 2), (('<s>', 'need'), 9)]


In [87]:
# calculating smoothed bigram prob including beta
length = len(vocab)
p_bigrams_beta2 = {}


for i in vocab:
    for j in vocab:
        if (i,j) not in bigram_count and (i,j) not in bigram_count2 and i not in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((0 + 1 + 3*0) / (unigram_count[i] + length + 3*0))
        elif (i,j) in bigram_count and (i,j) not in bigram_count2 and i not in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((bigram_count[(i,j)] + 1 + 3*0) / 
                                     (unigram_count[i] + length + 3*0))        
        elif (i,j) not in bigram_count and (i,j) not in bigram_count2 and i in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((0 + 1 + 3*0) / 
                                     (unigram_count[i] + length + 3*unigram_count2[i]))
        elif (i,j) in bigram_count and (i,j) not in bigram_count2 and i in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((bigram_count[(i,j)] + 1 + 3*0) / 
                                     (unigram_count[i] + length + 3*unigram_count2[i]))
        elif (i,j) in bigram_count and (i,j) in bigram_count2 and i in unigram_count2:
            p_bigrams_beta2[(i,j)] = ((bigram_count[(i,j)] + 1 + 3*bigram_count2[(i,j)]) / 
                                     (unigram_count[i] + length + 3*unigram_count2[i]))        


list(p_bigrams_beta2.items())[:5]

[(('eqfacebookgrp', 'eqfacebookgrp'), 0.00013605442176870748),
 (('eqfacebookgrp', 'canal'), 0.00013605442176870748),
 (('eqfacebookgrp', 'computer'), 0.00013605442176870748),
 (('eqfacebookgrp', 'noone'), 0.00013605442176870748),
 (('eqfacebookgrp', 'essay'), 0.00013605442176870748)]

In [90]:
with open('p_bigrams_beta2.pickle', 'wb') as handle:
    pickle.dump(p_bigrams_beta2, handle, protocol=pickle.HIGHEST_PROTOCOL)

del p_bigrams_beta2

with open('p_bigrams_beta2.pickle', 'rb') as handle:
    p_bigrams_beta2 = pickle.load(handle)

In [92]:
import random

def next_word(context):#generate most probable next word after 'context'
    r=random.random()
    map_to_probs = {}
    
    for token in vocab:
        map_to_probs[token] = p_bigrams_beta2[(context, token)] #prob. of occurrence of each word 'token' after 'context'

    summ = 0  #summ stores cumulative probabilities of occurrence of a word after a 'context' 
    for token in (map_to_probs):
        summ += map_to_probs[token] 
        if (summ > r):
            return token


def generate_text(): #returns 1 generated sentence

        minn=7
        maxx=20
        context_queue = '<s>' #current last word seen/generated; sent as 'context' to next_word() function
        result = ['<s>']  #entire sentence generated upto present time 
        
        c = 1
        while  c<=maxx :
            obj = next_word(context_queue)
            if obj == '</s>' and c <= minn:
                continue
            elif obj == '</s>' and c > minn:
                break
            else: 
                context_queue=obj
                result.append(obj)
                c += 1
                            
        result.append('</s>')
        return ' '.join(result)
    

#genrate 500 sentences using smoothed probabilities with beta positive
sentences_beta2 = []
for i in range(500):
    sentences_beta2.append(generate_text())

## Print 5 negative sentences

In [95]:
for s in sentences_beta2[:5]:
    print(s)
    print()

<s> asiabrands divine ashdonaldson cornell highway ramsay begin hoaaaaaaaaaaaaa bookshelf video midsummers acid pie demerol rain invite disco janjohannesson 5pm umahameed </s>

<s> allies0r amazon 28 alexis mouth medication abbr depression water iran staaceeyy single johngreenaway upto tacit hurrrrry nhoustonreed sorta machine two </s>

<s> ubertwitter bambino woodland carve 5th orissa545 sweetteach81 nelldamylf backup234 classmates teachers mccormicks technique florida s5 az hexmurda richard serena 2call </s>

<s> test deadlines amount philgerb mandatory fear draft hardheaded supp dam dot year paddypower hamilton sherinegamal zombie celeb divatheriva 20 croissant </s>

<s> crap march actor mercy tolerance jazzyyyyyy 5000000 mummmyyysss video schooooool kimscriven wine stream abandon keithmelton99 plaintruthiness goodluck harper rebound liverpool </s>



In [96]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

scores_beta2 = []
mod_scores_beta2 = []
labels_beta2 = []

def sentiment_scores(sentence):
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    sentiment_dict = sid_obj.polarity_scores(sentence) 
    scores_beta2.append(sentiment_dict['compound'])
    mod_scores_beta2.append(abs(sentiment_dict['compound']))
    
    # decide sentiment as positive and negative
    if sentiment_dict['compound'] > 0 :
        labels_beta2.append(1)
    else:
        labels_beta2.append(0)
        
        
for s in sentences_beta2:
    sentiment_scores(s)

In [106]:
df_beta2 = pd.DataFrame({'sentences with beta negative': sentences_beta2, 'labels': labels_beta2, 'vader_scores': scores_beta2, 'mod_vader_scores': mod_scores_beta2})
display(df_beta2.head(5))

df_beta2.to_csv('df_beta2.csv', index=False)
df_beta2 = pd.read_csv('df_beta2')

Unnamed: 0,sentences with beta negative,labels,vader_scores,mod_vader_scores
0,<s> asiabrands divine ashdonaldson cornell hig...,1,0.6369,0.6369
1,<s> allies0r amazon 28 alexis mouth medication...,0,-0.4588,0.4588
2,<s> ubertwitter bambino woodland carve 5th ori...,0,0.0,0.0
3,<s> test deadlines amount philgerb mandatory f...,0,-0.4404,0.4404
4,<s> crap march actor mercy tolerance jazzyyyyy...,0,-0.2023,0.2023


In [101]:
mod_scores_beta2 = df_beta2.mod_vader_scores
avg_vader_score_beta2 = sum(mod_scores_beta2) / len(mod_scores_beta2)
print("average vader score for negative sentences :", avg_vader_score_beta2)

del p_bigrams_beta2

average vader score for negative sentences : 0.3784063999999999


## Data preparation for training

In [22]:
df_beta1 = pd.read_csv('df_beta1')
df_beta1.drop(['Unnamed: 0'], axis=1, inplace=True)
df_beta1 = df_beta1.sort_values('mod_vader_scores', ascending=False)
df_beta1.rename(columns={'sentences with beta positive': 'sentences with beta'}, inplace=True)
display(df_beta1.head())
df_beta_pos_250 = df_beta1.iloc[:250, :]
df_beta_pos_250.shape

Unnamed: 0,sentences with beta,labels,vader_scores,mod_vader_scores
480,<s> davidarchie unfold belcourt dufalbagent bi...,1,0.9133,0.9133
232,<s> free lilyjang another ppl mcmahon shiny be...,1,0.9022,0.9022
316,<s> slowinagoodway incentive mvc2 nikkithebee ...,1,0.8834,0.8834
66,<s> im plzzzz cautious mandatory wtf pinchmysa...,0,-0.8807,0.8807
376,<s> pdxsays lucypope respectful vinsharma mast...,1,0.875,0.875


(250, 4)

In [23]:
df_beta2 = pd.read_csv('df_beta2')
df_beta2 = df_beta2.sort_values('mod_vader_scores', ascending=False)
df_beta2.rename(columns={'sentences with beta negative': 'sentences with beta'}, inplace=True)
display(df_beta2.head())
df_beta_neg_250 = df_beta2.iloc[:250, :]
df_beta_neg_250.shape

Unnamed: 0,sentences with beta,labels,vader_scores,mod_vader_scores
329,<s> sukebeuchujin maryyyyyy shittiest pdxsays ...,0,-0.9274,0.9274
348,<s> sideways dick wan matchesmalone theater mo...,0,-0.9201,0.9201
199,<s> iamsneezy prepare october die poker jrklov...,0,-0.9022,0.9022
385,<s> trouble isp teachers cloud leosgifted1 che...,0,-0.8779,0.8779
138,<s> lx mpt unite saint ne gamesurge star respe...,1,0.875,0.875


(250, 4)

## concate sentances of positive and negative

In [24]:
df_beta = pd.concat([df_beta_pos_250, df_beta_neg_250])
df_beta.head()

Unnamed: 0,sentences with beta,labels,vader_scores,mod_vader_scores
480,<s> davidarchie unfold belcourt dufalbagent bi...,1,0.9133,0.9133
232,<s> free lilyjang another ppl mcmahon shiny be...,1,0.9022,0.9022
316,<s> slowinagoodway incentive mvc2 nikkithebee ...,1,0.8834,0.8834
66,<s> im plzzzz cautious mandatory wtf pinchmysa...,0,-0.8807,0.8807
376,<s> pdxsays lucypope respectful vinsharma mast...,1,0.875,0.875


In [25]:
df_beta.shape

(500, 4)

In [113]:
mod_vader_scores = df_beta.mod_vader_scores
avg_vader_score_beta = sum(mod_vader_scores) / len(mod_vader_scores)
print("average vader score after concatination oof postive and negativee sentences :", avg_vader_score_beta)

average vader score after concatination oof postive and negativee sentences : 0.6119588000000001


In [26]:
display(raw_data.head())
raw_data.drop(['DATE_TIME'], axis=1, inplace=True)
raw_data.rename(columns={'preprocessed_txt': 'sentences'}, inplace=True)
raw_data.head()

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,0,Fri Jun 05 14:26:50 2009,About to get threaded and scared
1,1,Thu May 14 10:13:55 2009,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,Fri Jun 05 21:02:20 2009,worked on my car after work. showering then go...
3,1,Sun Jun 14 22:25:52 2009,@Marama Actually we start this afternoon! I w...
4,1,Sun May 31 00:42:12 2009,@gfalcone601 Aww Gi.don't worry.we'll vote for...


Unnamed: 0,LABEL,TEXT
0,0,About to get threaded and scared
1,1,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,worked on my car after work. showering then go...
3,1,@Marama Actually we start this afternoon! I w...
4,1,@gfalcone601 Aww Gi.don't worry.we'll vote for...


In [89]:
df_beta.head()
temp = df_beta.iloc[:, :2].copy()
temp.rename(columns={'labels': 'LABEL', 'sentences with beta': 'sentences'}, inplace=True)
temp.head()

Unnamed: 0,sentences,LABEL
480,<s> davidarchie unfold belcourt dufalbagent bi...,1
232,<s> free lilyjang another ppl mcmahon shiny be...,1
316,<s> slowinagoodway incentive mvc2 nikkithebee ...,1
66,<s> im plzzzz cautious mandatory wtf pinchmysa...,0
376,<s> pdxsays lucypope respectful vinsharma mast...,1


In [91]:
dataset_B = raw_data.append(temp)
dataset_B.drop(['sentences'], axis=1, inplace=True)
dataset_B.rename(columns = {'TEXT': 'sentances'}, inplace=True)
print("dataset B shape :", dataset_B.shape)
print("raw dataset shape :", raw_data.shape)
dataset_B.head()

dataset B shape : (4787, 2)
raw dataset shape : (4287, 2)


Unnamed: 0,LABEL,sentances
0,0,About to get threaded and scared
1,1,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,worked on my car after work. showering then go...
3,1,@Marama Actually we start this afternoon! I w...
4,1,@gfalcone601 Aww Gi.don't worry.we'll vote for...


In [92]:
dataset_B.to_csv('dataset_B.csv', index=False)

## Perplexity evalutation

In [93]:
import math


def perpleixty_positive(sentence):
    words = sentence.split()
    n = len(words)
    result = 0
    
    for k in range(n-1):
        if k == 0:
            result += math.log(unigram_count1[words[k]] / len(vocab))
        else:
            w1 = words[k]
            w2 = words[k+1]
            result += math.log(p_bigrams_beta1[(w1, w2)])

    result = result * (-1 / n)
    result = math.exp(result)
    return result


def perpleixty_negative(sentence):
    words = sentence.split()
    n = len(words)
    result = 0
    
    for k in range(n-1):
        if k == 0:
            result += math.log(unigram_count2[words[k]] / len(vocab))
        else:
            w1 = words[k]
            w2 = words[k+1]
            result += math.log(p_bigrams_beta2[(w1, w2)])

    result = result * (-1 / n)
    result = math.exp(result)
    return result

In [36]:
with open('p_bigrams_beta1.pickle', 'rb') as handle1:
    p_bigrams_beta1 = pickle.load(handle1)
    
df_beta1 = df_beta1.sort_values('mod_vader_scores', ascending=False)
postive_perplexity_score  = df_beta1['sentences with beta'][:250].apply(perpleixty_positive).mean()
print("postive perplexity score is :", postive_perplexity_score)

del p_bigrams_beta1

postive perplexity score is : 3382.3087853833845


In [38]:
with open('p_bigrams_beta2.pickle', 'rb') as handle2:
    p_bigrams_beta2 = pickle.load(handle2)
    
df_beta2 = df_beta2.sort_values('mod_vader_scores', ascending=False)
negative_perplexity_score  = df_beta2['sentences with beta'][:250].apply(perpleixty_negative).mean()
print("negative perplexity score is :", negative_perplexity_score)

del p_bigrams_beta2

negative perplexity score is : 3402.3628079430305


In [39]:
ans = round((postive_perplexity_score + negative_perplexity_score) / 2, 2)
print("average perplexity score of 500 generated sentences : ", ans)

average perplexity score of 500 generated sentences :  3391.67


## Extrinsic evaluation

In [96]:
test_data = pd.read_csv('A2_test_dataset.csv')

Unnamed: 0,LABEL,DATE_TIME,TEXT
0,1,Fri May 29 22:24:26 2009,@mileycyrus cheer up miley whats wrong?
1,1,Sun Jun 07 01:37:36 2009,Just got back in from The Belcourt. Saw &quot;...
2,1,Wed May 13 23:41:18 2009,http://bit.ly/IQPPD with video
3,1,Sun May 31 16:43:58 2009,@chloebli heyy! how was your carnavilistic da...
4,1,Fri May 29 10:36:59 2009,@deadlyseagal http://twitpic.com/66zex - Nice ...


(644, 3)

In [97]:
test_data = preprocess(test_data)
test_data.shape

PREPROCESSING PIPELINE AT A GLANCE


Unnamed: 0,LABEL,DATE_TIME,TEXT,white_space_removed,tokenized_data,stopword_removed_data,punct_removed_data,url_removed_data,spelling_checked_data,lemmetized_data,preprocessed_txt
0,1,Fri May 29 22:24:26 2009,@mileycyrus cheer up miley whats wrong?,@mileycyrus cheer up miley whats wrong?,"[@, mileycyrus, cheer, up, miley, whats, wrong...","[@, mileycyrus, cheer, miley, whats, wrong, ?]","[mileycyrus, cheer, miley, whats, wrong]","[mileycyrus, cheer, miley, whats, wrong]","[mileycyrus, cheer, miley, whats, wrong]","[mileycyrus, cheer, miley, whats, wrong]",<s> mileycyrus cheer miley whats wrong </s>
1,1,Sun Jun 07 01:37:36 2009,Just got back in from The Belcourt. Saw &quot;...,Just got back in from The Belcourt. Saw &quot;...,"[just, got, back, in, from, the, belcourt, ., ...","[got, back, belcourt, ., saw, &, quot, ;, fift...","[got, back, belcourt, saw, quot, fifth, quot, ...","[got, back, belcourt, saw, quot, fifth, quot, ...","[got, back, belcourt, saw, quot, fifth, quot, ...","[get, back, belcourt, saw, quot, fifth, quot, ...",<s> get back belcourt saw quot fifth quot awes...
2,1,Wed May 13 23:41:18 2009,http://bit.ly/IQPPD with video,http://bit.ly/IQPPD with video,"[http, :, //bit.ly/iqppd, with, video]","[http, :, //bit.ly/iqppd, video]","[http, video]","[http, video]","[http, video]","[http, video]",<s> http video </s>
3,1,Sun May 31 16:43:58 2009,@chloebli heyy! how was your carnavilistic da...,@chloebli heyy! how was your carnavilistic day...,"[@, chloebli, heyy, !, how, was, your, carnavi...","[@, chloebli, heyy, !, carnavilistic, day, ?, ...","[chloebli, heyy, carnavilistic, day, woow, mad...","[chloebli, heyy, carnavilistic, day, woow, mad...","[chloebli, hey, carnavilistic, day, wood, made...","[chloebli, hey, carnavilistic, day, wood, make...",<s> chloebli hey carnavilistic day wood make w...
4,1,Fri May 29 10:36:59 2009,@deadlyseagal http://twitpic.com/66zex - Nice ...,@deadlyseagal http://twitpic.com/66zex - Nice ...,"[@, deadlyseagal, http, :, //twitpic.com/66zex...","[@, deadlyseagal, http, :, //twitpic.com/66zex...","[deadlyseagal, http, nice, day]","[deadlyseagal, http, nice, day]","[deadlyseagal, http, nice, day]","[deadlyseagal, http, nice, day]",<s> deadlyseagal http nice day </s>


(644, 3)

In [102]:
# test_data.drop(['DATE_TIME'], axis=1, inplace=True)
display(test_data.head())
# test_data.rename(columns={'preprocessed_txt': 'sentances'}, axis=1, inplace=True)
test_data.shape

Unnamed: 0,LABEL,sentances
0,1,<s> mileycyrus cheer miley whats wrong </s>
1,1,<s> get back belcourt saw quot fifth quot awes...
2,1,<s> http video </s>
3,1,<s> chloebli hey carnavilistic day wood make w...
4,1,<s> deadlyseagal http nice day </s>


(644, 2)

In [None]:
test_data.head()

In [82]:
raw_data.head()

Unnamed: 0,LABEL,TEXT
0,0,About to get threaded and scared
1,1,@awaisnaseer I like Shezan Mangooo too!!! I ha...
2,1,worked on my car after work. showering then go...
3,1,@Marama Actually we start this afternoon! I w...
4,1,@gfalcone601 Aww Gi.don't worry.we'll vote for...


In [106]:
dataset_B = pd.read_csv('dataset_B.csv')
dataset_B = dataset_B.sample(frac=1)
dataset_B

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score


def train_and_evaluate(train_sentences, train_labels, test_sentences, test_labels):
    '''
    parameters:
    train_sentences : list of training sentences
    train_labels : list of training labels
    test_sentences : list of test sentences
    test_labels : list of test labels
    output:
    accuracy : accuracy of the test set
    '''
    
    # Model building
    model = make_pipeline(TfidfVectorizer(), RandomForestClassifier(n_estimators=30, random_state=333))
    
    # Training the model with the training data
    model.fit(train_sentences, train_labels)
    
    # Predicting the test data categories
    predicted_test_labels = model.predict(test_sentences)
    return accuracy_score(test_labels, predicted_test_labels)


acc_A = train_and_evaluate(raw_data.TEXT.to_list(), raw_data.LABEL.to_list(), 
                   test_data.TEXT.to_list(), test_data.LABEL.to_list())

acc_B = train_and_evaluate(dataset_B.sentances.to_list(), dataset_B.LABEL.to_list(), 
                   test_data.TEXT.to_list(), test_data.LABEL.to_list())

In [87]:
print("accracy on Dataset A is : ", acc_A)
print("accracy on dataset B is : ", acc_B)

accracy on Dataset A is : 0.785714285714285
accracy on dataset B is : 0.796583850931677
