In [3]:
import ast
import pickle

import numpy as np
import pandas as pd

from preprocess_text import *
from bigrams import *
from smoothers import *
from LanguageModels import *

In [4]:
# Load the raw dataset. Only the commented-out preprocessing cell reads this
# frame; `df` is re-bound to the preprocessed CSV further down.
raw_dataset_path = 'A1_dataset.csv'
df = pd.read_csv(raw_dataset_path)

In [3]:
# One-off preprocessing pass, kept commented out: its result is written to
# 'preprocessed_A1.csv' (last line), and that cached file is what the notebook
# reads back afterwards.
# NOTE(review): `progress_apply` is presumably provided by tqdm's pandas
# integration, which is not set up in the visible cells -- confirm before
# re-enabling.
# NOTE(review): the helper below is named `preprocess_text` and that same name
# is re-bound to a list right after the apply -- consider distinct names.
# text = df['TEXT'].to_list()
# def preprocess_text(text):
#     text = lowercase_text(text)
#     text = remove_url_html(text)
#     text = remove_users(text)
#     text = remove_punctuations(text)
#     text = remove_whitespaces(text)
#     text = tokenization(text)
#     text = spelling_correction(text, 'autocorrect_full')
#     text = remove_alphanum(text)
#     return text
# df['preprocessed_text'] = df['TEXT'].progress_apply(preprocess_text)
# preprocess_text = df['preprocessed_text'].to_list()
# df.to_csv('preprocessed_A1.csv', index=False)

In [5]:
# Reload the cached preprocessing output. Every entry of the 'preprocessed_text'
# column is stored as the string form of a Python object (presumably a list of
# tokens -- confirm against the preprocessing step) and is parsed back here.
# ast.literal_eval accepts only Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the CSV.
df = pd.read_csv('preprocessed_A1.csv')
preprocess_text = [
    ast.literal_eval(serialized_tokens)
    for serialized_tokens in df['preprocessed_text'].to_list()
]

In [6]:
# Tally how many times each token occurs across every preprocessed sentence.
unigram_counts = {}
for tokens in preprocess_text:
    for token in tokens:
        # .get supplies 0 the first time a token is seen.
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

In [7]:
# Relative frequency of each token: its count divided by the total token count.
scale_factor = sum(unigram_counts.values())
unigram_probs = {
    token: count / scale_factor
    for token, count in unigram_counts.items()
}

In [8]:
# Number of distinct tokens in the unigram table (displayed as the cell output).
len(unigram_counts)

6952

# Saving Bigrams with Smoothing

In [9]:
# Build the bigram structure from the preprocessed sentences and pull out the
# two pieces the smoothing step consumes: the bigram table and the vocabulary size.
bigram_wrapper = Bigrams(preprocess_text)
bigrams, vocab_len = bigram_wrapper.get_bigrams(), bigram_wrapper.vocab_len

# Persist the wrapper so it can be restored later without rebuilding it.
with open('bigram_wrapper.pickle', 'wb') as wrapper_file:
    pickle.dump(bigram_wrapper, wrapper_file)

In [9]:
# Disabled alternative to rebuilding: restore the wrapper from
# 'bigram_wrapper.pickle' and re-derive `bigrams` / `vocab_len` from it.
# Only unpickle files you produced yourself -- pickle can execute code on load.
# load bigram_wrapper
# with open('bigram_wrapper.pickle', 'rb') as f:
#     bigram_wrapper = pickle.load(f)

# bigrams = bigram_wrapper.get_bigrams()
# vocab_len = bigram_wrapper.vocab_len

In [10]:
# Set up the smoother with the bigram table, vocabulary size and both unigram
# tables, then compute the Laplace-smoothed bigrams. The add-k variants are
# left disabled.
smoother = LMSmoothers(
    bigrams,
    vocab_len,
    unigram_counts=unigram_counts,
    unigrams_probs=unigram_probs,
)
laplace_smoothed_bigrams = smoother.laplace()
# add_5_smoothed_bigrams = smoother.add_k(5)
# add_5_with_unigram_prior_smoothed_bigrams = smoother.add_k_with_unigram_prior(5)

In [11]:
# Cache the Laplace-smoothed table on disk using the newest pickle protocol.
# The add-k variants below stay disabled, matching the smoothing cell.
with open('laplace_smoothed_bigrams.pickle', 'wb') as out_file:
    pickle.dump(laplace_smoothed_bigrams, out_file, pickle.HIGHEST_PROTOCOL)

# with open('add_5_smoothed_bigrams.pickle', 'wb') as handle:
#     pickle.dump(add_5_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'wb') as handle:
#     pickle.dump(add_5_with_unigram_prior_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Creating Bigram Models

In [12]:
# Restore the cached Laplace-smoothed table so the modelling section can start
# from the pickle. Only load pickles you created yourself: unpickling can run
# arbitrary code.
with open('laplace_smoothed_bigrams.pickle', 'rb') as in_file:
    laplace_smoothed_bigrams = pickle.load(in_file)

# with open('add_5_smoothed_bigrams.pickle', 'rb') as handle:
#     add_5_smoothed_bigrams = pickle.load(handle)

# with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'rb') as handle:
#     add_5_with_unigram_prior_smoothed_bigrams = pickle.load(handle)

In [13]:
# Baseline model over the Laplace-smoothed bigrams.
# NOTE(review): the three positional booleans are opaque from this notebook;
# check LanguageModel's signature before changing them.
lm = LanguageModel(
    laplace_smoothed_bigrams,
    unigram_probs,
    "vader",
    False,
    False,
    True,
    repetition_penalty=0.2,
)

In [14]:
# Read the positive prompts: one prompt per line, read as bytes, decoded as
# UTF-8 and stripped of surrounding whitespace (including the newline).
with open('pos_250_vader.txt', 'rb') as prompt_file:
    pos_prompts = [
        raw_line.decode('utf-8').strip()
        for raw_line in prompt_file.readlines()
    ]

In [15]:
# Rebuild the baseline model and generate one short sample seeded with 'paradise'.
lm = LanguageModel(
    laplace_smoothed_bigrams,
    unigram_probs,
    "vader",
    False,
    False,
    True,
    repetition_penalty=0.2,
)
# Bare last expression so the notebook displays the generated tokens.
lm.generate_text(['paradise'], 1, 7)

['paradise', 'glorious', 'perfectly', 'freedom', 'best', 'friend', 'just']

In [16]:
# Perplexity of a freshly generated sample. This calls generate_text again, so
# if generation is not deterministic the score may belong to a different
# sentence than the one displayed in the previous cell.
sample_tokens = lm.generate_text(['paradise'], 1, 7)
lm.computePerplexity(sample_tokens)

532.5508575134819

In [17]:
# Truncate (or create) the baseline output file so the generation loop, which
# appends one line per prompt, starts from an empty file.
with open('pos_gen_only_ext.txt', 'w'):
    pass

In [18]:
def append_to_file(filename: str, text1: str, text2: str) -> None:
    """Append one line of the form ``"<text1> <text2>"`` to ``filename``.

    The file is created if it does not exist. It is written as UTF-8
    explicitly, matching how the prompts are decoded when they are loaded, so
    the result does not depend on the platform's default encoding.

    Args:
        filename: Path of the file to append to.
        text1: First field of the line (the generated sentence in this notebook).
        text2: Second field of the line (the perplexity, already a string).
    """
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text1 + " " + text2 + '\n')

In [19]:
# Baseline run: one 10-token positive sentence per prompt. Each (sentence,
# perplexity) pair is kept in memory and also appended to 'pos_gen_only_ext.txt'.
lm = LanguageModel(
    laplace_smoothed_bigrams,
    unigram_probs,
    "vader",
    False,
    False,
    True,
    repetition_penalty=0.2,
)
ls_pos_gen = []
for prompt in pos_prompts:
    sentence_pos = lm.generate_text([prompt], sentiment=1, length=10)
    print("Positive: {}".format(sentence_pos))
    ppl = lm.computePerplexity(sentence_pos)
    print("Positive Perplexity: {}".format(ppl))
    ls_pos_gen.append((sentence_pos, ppl))
    sentence_text = " ".join(sentence_pos)
    append_to_file('pos_gen_only_ext.txt', sentence_text, str(ppl))

Positive: ['paradise', 'love', 'to', 'be', 'a', 'great', 'time', 'i', 'dont', 'feel']
Positive Perplexity: 644.8304013661058
Positive: ['perfectly', 'then', 'going', 'to', 'see', 'it', 'was', 'a', 'good', 'morning']
Positive Perplexity: 467.53033997787765
Positive: ['freedom', 'mother', 'n', 'sunbathin', 'perfectly', 'also', 'i', 'have', 'to', 'the']
Positive Perplexity: 1321.0342315659989
Positive: ['greatest', 'love', 'to', 'get', 'it', 'was', 'awesome', 'now', 'i', 'am']
Positive Perplexity: 635.7237523928185
Positive: ['glorious', 'sunshine', 'with', 'my', 'friends', 'over', 'to', 'see', 'you', 'are']
Positive Perplexity: 833.8439584139823
Positive: ['best', 'friends', 'over', 'a', 'good', 'morning', 'it', 'would', 'have', 'a']
Positive Perplexity: 793.4322208551166
Positive: ['love', 'that', 'you', 'have', 'to', 'be', 'in', 'the', 'best', 'friends']
Positive Perplexity: 428.3608370719978
Positive: ['awesome', 'i', 'was', 'a', 'great', 'day', 'to', 'be', 'the', 'best']
Positive Per

In [None]:
# Persist the (sentence, perplexity) pairs collected by the generation loop.
with open('ls_pos_gen_vader_thread.pickle', 'wb') as out_file:
    pickle.dump(ls_pos_gen, out_file, pickle.HIGHEST_PROTOCOL)

In [20]:
# Variant run: first positional flag enabled together with
# sentiment_scale_factor=1e-1. Results go to 'pos_gen_add_numerator.txt'.
# NOTE(review): judging by the output filename this is the "add to numerator"
# sentiment variant -- confirm against LanguageModel's signature.
# NOTE(review): the recorded output for this cell only cycles through the
# prompt words with perplexity near 7000; the sentiment term may be dominating.
output_path = 'pos_gen_add_numerator.txt'

# Start from an empty file: append_to_file opens in append mode, so without
# this a re-run of the cell would duplicate every line.
with open(output_path, 'w'):
    pass

ls_pos_gen = []
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", True, False, True, repetition_penalty=0.2, sentiment_scale_factor=1e-1)
for p in pos_prompts:
    sentence_pos = lm.generate_text([p], sentiment=1, length=10)
    print("Positive: {}".format(sentence_pos))
    ppl = lm.computePerplexity(sentence_pos)
    print("Positive Perplexity: {}".format(ppl))
    ls_pos_gen.append((sentence_pos, ppl))
    append_to_file(output_path, " ".join(sentence_pos), str(ppl))

Positive: ['paradise', 'greatest', 'love', 'glorious', 'freedom', 'glorious', 'love', 'best', 'paradise', 'greatest']
Positive Perplexity: 6997.182178002751
Positive: ['perfectly', 'greatest', 'love', 'freedom', 'best', 'glorious', 'freedom', 'glorious', 'best', 'love']
Positive Perplexity: 6982.36023037396
Positive: ['freedom', 'love', 'paradise', 'love', 'glorious', 'freedom', 'perfectly', 'paradise', 'best', 'best']
Positive Perplexity: 6996.980953426512
Positive: ['greatest', 'perfectly', 'paradise', 'glorious', 'love', 'freedom', 'love', 'perfectly', 'best', 'glorious']
Positive Perplexity: 6997.081564991271
Positive: ['glorious', 'freedom', 'love', 'greatest', 'paradise', 'best', 'love', 'perfectly', 'greatest', 'paradise']
Positive Perplexity: 6997.182178002751
Positive: ['best', 'perfectly', 'freedom', 'paradise', 'love', 'perfectly', 'glorious', 'freedom', 'love', 'paradise']
Positive Perplexity: 6996.980953426512
Positive: ['love', 'greatest', 'paradise', 'freedom', 'perfectl

In [21]:
# Variant run: first positional flag enabled, default sentiment scale factor.
# Results go to 'pos_gen_mul_numerator.txt'.
# NOTE(review): judging by the output filename this is the "multiply numerator"
# sentiment variant -- confirm against LanguageModel's signature.
output_path = 'pos_gen_mul_numerator.txt'

# Start from an empty file: append_to_file opens in append mode, so without
# this a re-run of the cell would duplicate every line.
with open(output_path, 'w'):
    pass

ls_pos_gen = []
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", True, False, True, repetition_penalty=0.2)
for p in pos_prompts:
    sentence_pos = lm.generate_text([p], sentiment=1, length=10)
    print("Positive: {}".format(sentence_pos))
    ppl = lm.computePerplexity(sentence_pos)
    print("Positive Perplexity: {}".format(ppl))
    ls_pos_gen.append((sentence_pos, ppl))
    append_to_file(output_path, " ".join(sentence_pos), str(ppl))

Positive: ['paradise', 'look', 'at', 'all', 'of', 'it', 'is', 'so', 'much', 'better']
Positive Perplexity: 822.1811178716018
Positive: ['perfectly', 'also', 'enjoyed', 'that', 'you', 'are', 'going', 'to', 'see', 'you']
Positive Perplexity: 781.0482944221122
Positive: ['freedom', 'of', 'the', 'new', 'glasses', 'greatest', 'show', 'tonight', 'was', 'awesome']
Positive Perplexity: 1555.0640124172087
Positive: ['greatest', 'show', 'in', 'the', 'day', 'to', 'work', 'out', 'on', 'my']
Positive Perplexity: 701.0475947712753
Positive: ['glorious', 'weather', 'and', 'a', 'good', 'morning', 'all', 'the', 'last', 'day']
Positive Perplexity: 715.7772836403662


In [None]:
# Variant run: second positional flag enabled instead of the first.
# Results go to 'pos_gen_div_denominator.txt'.
# NOTE(review): judging by the output filename this is the "divide denominator"
# sentiment variant -- confirm against LanguageModel's signature.
output_path = 'pos_gen_div_denominator.txt'

# Start from an empty file: append_to_file opens in append mode, so without
# this a re-run of the cell would duplicate every line.
with open(output_path, 'w'):
    pass

ls_pos_gen = []
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", False, True, True, repetition_penalty=0.2)
for p in pos_prompts:
    sentence_pos = lm.generate_text([p], sentiment=1, length=10)
    print("Positive: {}".format(sentence_pos))
    ppl = lm.computePerplexity(sentence_pos)
    print("Positive Perplexity: {}".format(ppl))
    ls_pos_gen.append((sentence_pos, ppl))
    append_to_file(output_path, " ".join(sentence_pos), str(ppl))