In [1]:
import pandas as pd
import numpy as np
import pickle
from preprocess_text import *
from bigrams import *
from smoothers import *
from LanguageModels import *

In [2]:
# Load the raw dataset from the working directory; its 'TEXT' column is the
# input to the preprocessing cell that follows.
df = pd.read_csv('A1_dataset.csv')

In [3]:
# Raw documents as a plain Python list (kept under its original name).
text = df['TEXT'].to_list()


def preprocess_document(raw_text):
    """Run one raw ``TEXT`` entry through the project's preprocessing helpers.

    The helpers (star-imported from the project modules) are applied in this
    fixed order: ``lowercase_text`` -> ``remove_url_html`` -> ``remove_users``
    -> ``remove_punctuations`` -> ``remove_whitespaces`` -> ``tokenization``
    -> ``spelling_correction(..., 'textblob')``.

    Parameters
    ----------
    raw_text
        A single value from ``df['TEXT']``.

    Returns
    -------
    Whatever ``spelling_correction`` returns. Presumably a list of tokens,
    since the unigram-count cell iterates over each result word by word
    (TODO confirm against the helper module).
    """
    lowered = lowercase_text(raw_text)
    without_markup = remove_url_html(lowered)
    without_users = remove_users(without_markup)
    without_punctuation = remove_punctuations(without_users)
    normalised = remove_whitespaces(without_punctuation)
    tokens = tokenization(normalised)
    # 'textblob' presumably selects the spell-correction backend -- verify in preprocess_text.py.
    return spelling_correction(tokens, 'textblob')


df['preprocessed_text'] = df['TEXT'].apply(preprocess_document)

# The function used to be called `preprocess_text` and was overwritten by this
# list in the same cell. The function now has its own name; the list keeps the
# name `preprocess_text` because later cells read the corpus through it.
preprocess_text = df['preprocessed_text'].to_list()

In [4]:
# Tally how many times each token occurs across the preprocessed corpus.
unigram_counts = {}
for tokens in preprocess_text:
    for token in tokens:
        # .get() yields 0 the first time a token is encountered.
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

In [5]:
# Turn raw counts into relative frequencies: count(word) / total token count.
scale_factor = sum(unigram_counts.values())
unigram_probs = {
    token: count / scale_factor
    for token, count in unigram_counts.items()
}

# Saving Bigrams with Smoothing

In [6]:
# bigram_wrapper = Bigrams(preprocess_text)
# bigrams = bigram_wrapper.get_bigrams()
# vocab_len = bigram_wrapper.vocab_len

# with open('bigram_wrapper.pickle', 'wb') as f:
#     pickle.dump(bigram_wrapper, f)

In [7]:
# # load bigram_wrapper
# with open('bigram_wrapper.pickle', 'rb') as f:
#     bigram_wrapper = pickle.load(f)

# bigrams = bigram_wrapper.get_bigrams()
# vocab_len = bigram_wrapper.vocab_len

In [8]:
# smoother = LMSmoothers(bigrams, vocab_len, unigram_counts=unigram_counts, unigrams_probs=unigram_probs)
# laplace_smoothed_bigrams = smoother.laplace()
# add_5_smoothed_bigrams = smoother.add_k(5)
# add_5_with_unigram_prior_smoothed_bigrams = smoother.add_k_with_unigram_prior(5)

In [9]:
# with open('laplace_smoothed_bigrams.pickle', 'wb') as handle:
#     pickle.dump(laplace_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('add_5_smoothed_bigrams.pickle', 'wb') as handle:
#     pickle.dump(add_5_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'wb') as handle:
#     pickle.dump(add_5_with_unigram_prior_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Creating Bigram Models

In [11]:
# Load the Laplace-smoothed bigrams from disk. According to the commented-out
# cells above, this file is written by pickling `smoother.laplace()`; no active
# cell regenerates it, so it must already exist in the working directory.
with open('laplace_smoothed_bigrams.pickle', 'rb') as pickle_file:
    laplace_smoothed_bigrams = pickle.load(pickle_file)

# with open('add_5_smoothed_bigrams.pickle', 'rb') as handle:
#     add_5_smoothed_bigrams = pickle.load(handle)

# with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'rb') as handle:
#     add_5_with_unigram_prior_smoothed_bigrams = pickle.load(handle)

In [14]:
# Sweep every on/off combination of the three boolean LanguageModel switches
# and, for each, generate one positive and one negative sentence from the same
# seed words and report their perplexities.
addToNum = [True, False]
addToDeno = [True, False]
addExternally = [True, False]

# Flattened grid, in the same order three nested loops would visit it.
flag_grid = [
    (num_flag, deno_flag, ext_flag)
    for num_flag in addToNum
    for deno_flag in addToDeno
    for ext_flag in addExternally
]

for addNum, addDeno, addExt in flag_grid:
    lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", addNum, addDeno, addExt)
    print("addNum: {}, addDeno: {}, addExt: {}".format(addNum, addDeno, addExt))

    # A fresh seed list is passed to each call in case generate_text modifies it.
    sentence_pos = lm.generate_text(['i', 'am'], sentiment=1, length=7)
    sentence_neg = lm.generate_text(['i', 'am'], sentiment=-1, length=7)
    print("Positive: {}".format(sentence_pos))
    print("Negative: {}".format(sentence_neg))

    perplexity_pos = lm.computePerplexity(sentence_pos)
    print("Positive Perplexity: {}".format(perplexity_pos))
    perplexity_neg = lm.computePerplexity(sentence_neg)
    print("Negative Perplexity: {}".format(perplexity_neg))

addNum: True, addDeno: True, addExt: True
Positive: ['i', 'am', 'so', 'much', 'for', 'the', 'lobby']
Negative: ['i', 'am', 'so', 'much', 'for', 'the', 'chilling']
Positive Perplexity: 126478.20653797932
Negative Perplexity: 139643.26154685358
addNum: True, addDeno: True, addExt: False
Positive: ['i', 'am', 'so', 'much', 'for', 'the', 'lobby']
Negative: ['i', 'am', 'so', 'much', 'for', 'the', 'chilling']
Positive Perplexity: 126478.20653797932
Negative Perplexity: 139643.26154685358
addNum: True, addDeno: False, addExt: True
Positive: ['i', 'am', 'ily', 'ily', 'ily', 'ily', 'ily']
Negative: ['i', 'am', 'raping', 'raping', 'raping', 'raping', 'raping']
Positive Perplexity: 790905.9358747303
Negative Perplexity: 790905.923352325
addNum: True, addDeno: False, addExt: False
Positive: ['i', 'am', 'ily', 'ily', 'ily', 'ily', 'ily']
Negative: ['i', 'am', 'raping', 'raping', 'raping', 'raping', 'raping']
Positive Perplexity: 790905.9358747303
Negative Perplexity: 790905.923352325
addNum: False,