In [1]:
import pandas as pd
import numpy as np
import pickle
from preprocess_text import *
from bigrams import *
from smoothers import *
from LanguageModels import *

In [2]:
df = pd.read_csv('A1_dataset.csv')

In [3]:
text = df['TEXT'].to_list()
def preprocess_text(text):
    text = lowercase_text(text)
    text = remove_url_html(text)
    text = remove_users(text)
    text = remove_punctuations(text)
    text = remove_whitespaces(text)
    text = tokenization(text)
    text = spelling_correction(text, 'textblob')
    return text
df['preprocessed_text'] = df['TEXT'].apply(preprocess_text)
preprocess_text = df['preprocessed_text'].to_list()

In [4]:
unigram_counts = {}
for sentence in preprocess_text:
    for word in sentence:
        if word in unigram_counts:
            unigram_counts[word] += 1
        else:
            unigram_counts[word] = 1

In [5]:
unigram_probs = {}
scale_factor = sum(unigram_counts.values())
for word in unigram_counts:
    unigram_probs[word] = unigram_counts[word] / scale_factor

# Sentiment Score Dump

In [6]:
# from tqdm import tqdm
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from transformers import pipeline

In [7]:
# sid = SentimentIntensityAnalyzer()
# ls_word_sentiment_vader = []
# for word in tqdm(unigram_counts):
#     ls_word_sentiment_vader.append((word, sid.polarity_scores(word)['compound']))
# with open('ls_word_sentiment_vader.pickle', 'wb') as f:
#     pickle.dump(ls_word_sentiment_vader, f)

In [8]:
# hf_sentiment_model = pipeline('sentiment-analysis')
# ls_word_sentiment_hf = []
# for word in tqdm(unigram_counts):
#     hf_res = hf_sentiment_model(word)
#     score = hf_res[0]['score']
#     pos_neg = hf_res[0]['label']
#     if pos_neg == 'NEGATIVE':
#         score = -score
#     ls_word_sentiment_hf.append((word,score))
# with open('ls_word_sentiment_hf.pickle', 'wb') as f:
#     pickle.dump(ls_word_sentiment_hf, f)


# Saving Bigrams with Smoothing

In [11]:
# bigram_wrapper = Bigrams(preprocess_text)
# bigrams = bigram_wrapper.get_bigrams()
# vocab_len = bigram_wrapper.vocab_len

# with open('bigram_wrapper.pickle', 'wb') as f:
#     pickle.dump(bigram_wrapper, f)

In [None]:
# load bigram_wrapper
with open('bigram_wrapper.pickle', 'rb') as f:
    bigram_wrapper = pickle.load(f)

bigrams = bigram_wrapper.get_bigrams()
vocab_len = bigram_wrapper.vocab_len

In [10]:
smoother = LMSmoothers(bigrams, vocab_len, unigram_counts=unigram_counts, unigram_probs=unigram_probs)
laplace_smoothed_bigrams = smoother.laplace()
add_5_smoothed_bigrams = smoother.add_k(5)
add_5_with_unigram_prior_smoothed_bigrams = smoother.add_k_with_unigram_prior(5)

TypeError: __init__() got an unexpected keyword argument 'unigram_probs'

In [10]:
with open('laplace_smoothed_bigrams.pickle', 'wb') as handle:
    pickle.dump(laplace_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('add_5_smoothed_bigrams.pickle', 'wb') as handle:
    pickle.dump(add_5_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'wb') as handle:
    pickle.dump(add_5_with_unigram_prior_smoothed_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
with open('laplace_smoothed_bigrams.pickle', 'rb') as handle:
    laplace_smoothed_bigrams = pickle.load(handle)

with open('add_5_smoothed_bigrams.pickle', 'rb') as handle:
    add_5_smoothed_bigrams = pickle.load(handle)

with open('add_5_with_unigram_prior_smoothed_bigrams.pickle', 'rb') as handle:
    add_5_with_unigram_prior_smoothed_bigrams = pickle.load(handle)

In [12]:
BigramLM = LanguageModel(laplace_smoothed_bigrams, unigram_counts, 'vader')

In [13]:
sentence_pos = BigramLM.generate_text(['i', 'am'], sentiment=1, length=7)

In [14]:
sentence_pos

['i', 'am', 'ily', 'paradise', 'best', 'glorious', 'perfectly']

In [15]:
sentence_neg = BigramLM.generate_text(['i', 'am'], sentiment=-1, length=7)

In [16]:
sentence_neg

['i', 'am', 'raping', 'kill', 'hell', 'killed', 'shittiest']

In [17]:
BigramLM2 = LanguageModel(laplace_smoothed_bigrams, unigram_counts, 'hf')

In [18]:
sentence_pos2 = BigramLM2.generate_text(['i', 'am'], sentiment=1, length=7)

In [19]:
sentence_pos2

['i', 'am', 'bliss', 'happiness', 'enjoying', 'loved', 'beautiful']

In [20]:
sentence_neg2 = BigramLM2.generate_text(['i', 'am'], sentiment=-1, length=7)

In [21]:
sentence_neg2

['i', 'am', 'pathetic', 'irritating', 'letdown', 'disappointments', 'worst']