In [1]:
import ast
import pickle

import numpy as np
import pandas as pd

from preprocess_text import *
from bigrams import *
from smoothers import *
from LanguageModels import *

In [2]:
# Raw dataset. Only the commented-out preprocessing cell reads it (via df['TEXT']);
# `df` is reassigned once the cached 'preprocessed_A1.csv' is loaded further down.
df = pd.read_csv('A1_dataset.csv')

In [3]:
# text = df['TEXT'].to_list()
# def preprocess_text(text):
#     text = lowercase_text(text)
#     text = remove_url_html(text)
#     text = remove_users(text)
#     text = remove_punctuations(text)
#     text = remove_whitespaces(text)
#     text = tokenization(text)
#     text = spelling_correction(text, 'autocorrect_full')
#     text = remove_alphanum(text)
#     return text
# df['preprocessed_text'] = df['TEXT'].progress_apply(preprocess_text)
# preprocess_text = df['preprocessed_text'].to_list()
# df.to_csv('preprocessed_A1.csv', index=False)

In [4]:
# Reload the cached preprocessing output instead of re-running the preprocessing pipeline.
df = pd.read_csv('preprocessed_A1.csv')
# Each 'preprocessed_text' cell is expected to hold the string form of a Python
# list (the CSV round-trip stringifies it). ast.literal_eval parses that literal
# back into a list without executing arbitrary code, which eval() on file
# contents would do.
preprocess_text = [
    ast.literal_eval(serialized_tokens)
    for serialized_tokens in df['preprocessed_text'].to_list()
]
    

In [5]:
# Tally how many times each token occurs across all preprocessed sentences.
unigram_counts = {}
for tokens in preprocess_text:
    for token in tokens:
        # .get() supplies 0 the first time a token is seen.
        unigram_counts[token] = unigram_counts.get(token, 0) + 1

In [6]:
# Turn the raw counts into relative frequencies: count / total number of tokens.
scale_factor = sum(unigram_counts.values())
unigram_probs = {
    token: count / scale_factor
    for token, count in unigram_counts.items()
}

In [7]:
# Number of distinct tokens in unigram_counts (the unigram vocabulary size).
len(unigram_counts)

6952

# Saving Bigrams with Smoothing

In [8]:
# Construct the Bigrams wrapper over the preprocessed sentences and pull out the
# two values the smoother is built from (the bigrams and the vocabulary length).
bigram_wrapper = Bigrams(preprocess_text)
bigrams = bigram_wrapper.get_bigrams()
# NOTE(review): vocab_len is read after get_bigrams(); keep this order unless
# Bigrams is confirmed to set vocab_len in its constructor.
vocab_len = bigram_wrapper.vocab_len

# Cache the wrapper on disk so it can be reloaded with pickle.load instead of rebuilt.
with open('bigram_wrapper.pickle', 'wb') as f:
    pickle.dump(bigram_wrapper, f)

In [9]:
# load bigram_wrapper
# with open('bigram_wrapper.pickle', 'rb') as f:
#     bigram_wrapper = pickle.load(f)

# bigrams = bigram_wrapper.get_bigrams()
# vocab_len = bigram_wrapper.vocab_len

In [10]:
# Build the smoother from the bigrams plus the unigram statistics, then take the
# result of its laplace() method; that result feeds every LanguageModel in this notebook.
# NOTE(review): the keyword is spelled `unigrams_probs` (extra "s") — confirm it
# matches the LMSmoothers signature.
smoother = LMSmoothers(bigrams, vocab_len, unigram_counts=unigram_counts, unigrams_probs=unigram_probs)
laplace_smoothed_bigrams = smoother.laplace()
# add_5_smoothed_bigrams = smoother.add_k(5)
# add_5_with_unigram_prior_smoothed_bigrams = smoother.add_k_with_unigram_prior(5)

In [11]:
# Persist the smoothed bigrams so they can be reloaded without re-running the smoother.
with open('laplace_smoothed_bigrams.pickle', 'wb') as handle:
    pickle.dump(laplace_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('add_5_smoothed_bigrams.pickle', 'wb') as handle:
#     pickle.dump(add_5_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'wb') as handle:
#     pickle.dump(add_5_with_unigram_prior_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Creating Bigram Models

In [12]:
# Reload the smoothed bigrams from the pickle file written by this notebook.
# pickle.load can execute arbitrary code while unpickling, so only point this at
# files you produced yourself.
with open('laplace_smoothed_bigrams.pickle', 'rb') as handle:
    laplace_smoothed_bigrams = pickle.load(handle)

# with open('add_5_smoothed_bigrams.pickle', 'rb') as handle:
#     add_5_smoothed_bigrams = pickle.load(handle)

# with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'rb') as handle:
#     add_5_with_unigram_prior_smoothed_bigrams = pickle.load(handle)

In [13]:
# Language model built from the smoothed bigrams and unigram probabilities, using the "vader" option.
# NOTE(review): the three positional booleans are opaque from here — check the
# LanguageModel signature for their meaning; passing them by keyword would make
# the configuration self-documenting. The same constructor call is repeated
# verbatim in later cells, so this instance is replaced before it is used.
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", False, False, True, repetition_penalty=0.2)

In [14]:
# Load the negative seed prompts: the file is read as bytes, and every line is
# decoded as UTF-8 and stripped of surrounding whitespace.
with open('neg_250_vader.txt', 'rb') as handle:
    neg_prompts = [raw_line.decode('utf-8').strip() for raw_line in handle]

In [15]:
# Smoke test: generate from the single prompt 'hell' with second argument -1 and
# third argument 7 (later cells pass these as sentiment= and length=).
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", False, False, True, repetition_penalty=0.2)
lm.generate_text(['hell'], -1, 7)

['hell', 'yeah', 'ive', 'been', 'a', 'little', 'bit']

In [16]:
# Perplexity of a freshly generated sentence for the same prompt.
# NOTE(review): this calls generate_text again rather than reusing the sentence
# displayed by the previous cell; if generation is not deterministic, the value
# belongs to a different sample — confirm.
lm.computePerplexity(lm.generate_text(['hell'], -1, 7))

641.2602032666833

In [17]:
# Opening in 'w' mode truncates the file, so the generation run that appends to
# it starts from an empty file.
with open('neg_gen_only_ext.txt', 'w'):
    pass

In [18]:
def append_to_file(filename, text1, text2):
    """Append one line of the form "<text1> <text2>" to ``filename``.

    Args:
        filename: Path of the file to append to; it is created if missing.
        text1: First field of the line (the generated sentence in this notebook).
        text2: Second field of the line (the perplexity, already a string).

    The file is written as UTF-8 explicitly. The prompts are decoded from UTF-8
    when loaded, so relying on the platform default encoding could raise
    UnicodeEncodeError or produce differently encoded files on other machines.
    """
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text1 + " " + text2 + '\n')

In [19]:
# Baseline run: for every negative prompt, generate a 10-token sentence with
# sentiment=-1, compute its perplexity, print both, keep the pair in memory and
# append it to 'neg_gen_only_ext.txt'.
ls_neg_gen = []
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", False, False, True, repetition_penalty=0.2)
for prompt in neg_prompts:
    sentence_neg = lm.generate_text([prompt], sentiment=-1, length=10)
    print("Negative: {}".format(sentence_neg))
    ppl = lm.computePerplexity(sentence_neg)
    print("Negative Perplexity: {}".format(ppl))
    record = (sentence_neg, ppl)
    ls_neg_gen.append(record)
    joined_sentence = " ".join(sentence_neg)
    append_to_file('neg_gen_only_ext.txt', joined_sentence, str(ppl))

Negative: ['raping', 'hell', 'is', 'a', 'great', 'weekend', 'to', 'be', 'a', 'few']
Negative Perplexity: 829.9808557932836
Negative: ['slavery', 'raping', 'tragedy', 'hell', 'is', 'the', 'new', 'iphone', 'from', 'the']
Negative Perplexity: 2086.6723835529174
Negative: ['kill', 'raping', 'slavery', 'killed', 'the', 'best', 'of', 'my', 'mom', 'got']
Negative Perplexity: 2014.3949155207056
Negative: ['hell', 'everybody', 'is', 'the', 'new', 'followers', 'a', 'good', 'luck', 'with']
Negative Perplexity: 1225.1286728660962
Negative: ['killed', 'tea', 'yet', 'again', 'and', 'i', 'have', 'to', 'get', 'my']
Negative Perplexity: 708.6973256261734
Negative: ['killing', 'me', 'too', 'i', 'have', 'to', 'the', 'day', 'with', 'my']
Negative Perplexity: 420.33889155784993
Negative: ['evil', 'and', 'i', 'am', 'not', 'the', 'same', 'name', 'with', 'me']
Negative Perplexity: 824.5222126215607
Negative: ['horrific', 'slavery', 'hell', 'yeah', 'i', 'have', 'to', 'the', 'day', 'to']
Negative Perplexity: 77

In [20]:
# save generated sentences
# ls_neg_gen holds the (generated sentence, perplexity) tuples collected by the
# most recently executed generation cell.
with open('ls_neg_gen_vader_thread.pickle', 'wb') as handle:
    pickle.dump(ls_neg_gen, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Variant run: the first boolean flag is True (it is False in the baseline run)
# and sentiment_scale_factor=1e-1 is passed. Results go to
# 'neg_gen_add_numerator.txt'.
output_file = 'neg_gen_add_numerator.txt'

# clear the file first: append_to_file only appends, so without this every
# re-run of the cell would stack another copy of the results in the file
with open(output_file, 'w'):
    pass

ls_neg_gen = []
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", True, False, True, repetition_penalty=0.2, sentiment_scale_factor=1e-1)
for p in neg_prompts:
    sentence_neg = lm.generate_text([p], sentiment=-1, length=10)
    print("Negative: {}".format(sentence_neg))
    ppl = lm.computePerplexity(sentence_neg)
    print("Negative Perplexity: {}".format(ppl))
    ls_neg_gen.append((sentence_neg, ppl))
    append_to_file(output_file, " ".join(sentence_neg), str(ppl))

Negative: ['raping', 'slavery', 'kill', 'hell', 'kill', 'kill', 'hell', 'killed', 'slavery', 'slavery']
Negative Perplexity: 6957.298289490414
Negative: ['slavery', 'kill', 'kill', 'killed', 'killed', 'raping', 'raping', 'killed', 'kill', 'slavery']
Negative Perplexity: 6955.899475957435
Negative: ['kill', 'slavery', 'hell', 'kill', 'slavery', 'raping', 'raping', 'hell', 'hell', 'raping']
Negative Perplexity: 6957.597640824956
Negative: ['hell', 'killed', 'killed', 'killed', 'raping', 'hell', 'hell', 'raping', 'slavery', 'slavery']
Negative Perplexity: 6958.597827424857
Negative: ['killed', 'hell', 'killed', 'kill', 'slavery', 'hell', 'hell', 'hell', 'raping', 'kill']
Negative Perplexity: 6959.697729014282
Negative: ['killing', 'hell', 'killed', 'hell', 'hell', 'kill', 'slavery', 'raping', 'kill', 'kill']
Negative Perplexity: 6958.898010682494
Negative: ['evil', 'hell', 'slavery', 'slavery', 'hell', 'raping', 'kill', 'hell', 'raping', 'raping']
Negative Perplexity: 6957.397597763808
Ne

In [21]:
# Variant run: the first boolean flag is True (it is False in the baseline run)
# and no sentiment_scale_factor is passed. Results go to
# 'neg_gen_mul_numerator.txt'.
output_file = 'neg_gen_mul_numerator.txt'

# clear the file first: append_to_file only appends, so without this every
# re-run of the cell would stack another copy of the results in the file
with open(output_file, 'w'):
    pass

ls_neg_gen = []
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", True, False, True, repetition_penalty=0.2)
for p in neg_prompts:
    sentence_neg = lm.generate_text([p], sentiment=-1, length=10)
    # The sentence is printed once, with its label; the extra bare print of the
    # same list was redundant debug output.
    print("Negative: {}".format(sentence_neg))
    ppl = lm.computePerplexity(sentence_neg)
    print("Negative Perplexity: {}".format(ppl))
    ls_neg_gen.append((sentence_neg, ppl))
    append_to_file(output_file, " ".join(sentence_neg), str(ppl))

['raping', 'slavery', 'and', 'a', 'great', 'time', 'to', 'the', 'best', 'i']
Negative: ['raping', 'slavery', 'and', 'a', 'great', 'time', 'to', 'the', 'best', 'i']
Negative Perplexity: 901.6885266295739
['slavery', 'killed', 'tea', 'so', 'i', 'dont', 'want', 'to', 'the', 'last']
Negative: ['slavery', 'killed', 'tea', 'so', 'i', 'dont', 'want', 'to', 'the', 'last']
Negative Perplexity: 719.3617643744873
['kill', 'me', 'too', 'early', 'today', 'i', 'was', 'just', 'woke', 'up']
Negative: ['kill', 'me', 'too', 'early', 'today', 'i', 'was', 'just', 'woke', 'up']
Negative Perplexity: 761.6359982131581
['hell', 'everybody', 'day', 'to', 'the', 'same', 'page', 'of', 'my', 'phone']
Negative: ['hell', 'everybody', 'day', 'to', 'the', 'same', 'page', 'of', 'my', 'phone']
Negative Perplexity: 1041.171891177359
['killed', 'a', 'good', 'day', 'with', 'you', 'can', 'get', 'my', 'mom']
Negative: ['killed', 'a', 'good', 'day', 'with', 'you', 'can', 'get', 'my', 'mom']
Negative Perplexity: 814.522345506

In [None]:
# Variant run writing to 'neg_gen_div_denominator.txt'.
# NOTE(review): the LanguageModel arguments here are identical to the cell that
# writes 'neg_gen_mul_numerator.txt'. Unless the LanguageModels module was
# edited between the two runs, both cells exercise the same configuration —
# confirm which argument is meant to select the "div denominator" variant.
output_file = 'neg_gen_div_denominator.txt'

# clear the file first: append_to_file only appends, so without this every
# re-run of the cell would stack another copy of the results in the file
with open(output_file, 'w'):
    pass

ls_neg_gen = []
lm = LanguageModel(laplace_smoothed_bigrams, unigram_probs, "vader", True, False, True, repetition_penalty=0.2)
for p in neg_prompts:
    sentence_neg = lm.generate_text([p], sentiment=-1, length=10)
    # The sentence is printed once, with its label; the extra bare print of the
    # same list was redundant debug output.
    print("Negative: {}".format(sentence_neg))
    ppl = lm.computePerplexity(sentence_neg)
    print("Negative Perplexity: {}".format(ppl))
    ls_neg_gen.append((sentence_neg, ppl))
    append_to_file(output_file, " ".join(sentence_neg), str(ppl))