In [53]:
import numpy as np
import matplotlib.pyplot as plt
import re
from utils import emotion_scores



In [49]:
# Implement the Bigram Language Model Class
class BigramLM:
    """Bigram language model estimated from a one-sentence-per-line text corpus.

    State filled in by ``train_lm``:
        bigram_freq: dict mapping ``(previous_word, next_word)`` to the number
            of times the two tokens appear next to each other in a sentence.
        total_freq: dict mapping each token to its number of occurrences.
        vocab: set of distinct tokens seen in the corpus.
        vocab_size: ``len(vocab)``.
        corpus_size: number of corpus lines read (one token list per line).
        corpus: list of token lists, one per corpus line.
        labels: list of label lines with surrounding whitespace stripped.
    """

    def __init__(self):
        self.bigram_freq = {}
        self.total_freq = {}
        self.vocab = set()
        self.vocab_size = 0
        self.corpus_size = 0
        self.corpus = []
        self.labels = []

    def preprocess_text(self, text):
        """Normalise raw text before tokenisation.

        Args:
            text: the raw string (typically one corpus line).

        Returns:
            The lowercased string with every character other than ``a-z``,
            ``0-9`` and whitespace removed.
        """
        # Convert to lowercase
        text = text.lower()

        # Remove all non-alphanumeric characters (whitespace is kept so the
        # caller can still split on it)
        text = re.sub(r'[^a-z0-9\s]', '', text)
        return text

    def train_lm(self, corpus, labels):
        """Read a corpus/labels file pair and update the frequency tables.

        Each corpus line is preprocessed and whitespace-split into one
        sentence; each labels line is stripped and stored as-is. Calling this
        method again appends the new sentences and labels and adds only their
        counts, so earlier sentences are never counted twice.

        Args:
            corpus: path to the UTF-8 corpus file (one sentence per line).
            labels: path to the UTF-8 labels file (one label per line).

        Raises:
            OSError: if either file cannot be opened.
        """
        # Read both files into locals first so a failure while reading leaves
        # the model state untouched.
        with open(corpus, 'r', encoding='utf-8') as corpus_file, open(labels, 'r', encoding='utf-8') as labels_file:
            new_sentences = [self.preprocess_text(line).split() for line in corpus_file]
            new_labels = [line.strip() for line in labels_file]

        self.corpus.extend(new_sentences)
        self.labels.extend(new_labels)

        # Get the vocabulary
        self.vocab = set(word for sentence in self.corpus for word in sentence)

        # Get the vocabulary size
        self.vocab_size = len(self.vocab)

        # Get the corpus size
        self.corpus_size = len(self.corpus)

        # Only the sentences read by this call are counted; sentences from
        # earlier calls already contributed to the tables.
        for sentence in new_sentences:
            # Get the unigram frequencies: every token counts once, including
            # the only token of a one-word sentence (which forms no bigram).
            for unigram in sentence:
                self.total_freq[unigram] = self.total_freq.get(unigram, 0) + 1

            # Get the bigram frequencies over adjacent token pairs
            for bigram in zip(sentence, sentence[1:]):
                self.bigram_freq[bigram] = self.bigram_freq.get(bigram, 0) + 1

    # Get the probability of a bigram without smoothing
    def get_bigram_prob(self, previous_word, next_word):
        """Return the unsmoothed estimate count(previous, next) / count(previous).

        Args:
            previous_word: the conditioning token.
            next_word: the token that follows it.

        Returns:
            The relative frequency as a float, or the integer ``0`` when the
            pair was never observed.
        """
        bigram = (previous_word, next_word)
        # If the bigram is not present in the corpus, return 0
        if bigram not in self.bigram_freq:
            return 0
        # A recorded bigram implies its first token was counted in total_freq,
        # so the denominator exists and is non-zero.
        return self.bigram_freq[bigram] / self.total_freq[previous_word]

    # Q2
    # Get the probability of a bigram with laplace smoothing
    def get_bigram_prob_laplace(self, previous_word, next_word):
        """Return the add-one smoothed estimate for a bigram.

        Computes ``(count(previous, next) + 1) / (count(previous) + vocab_size)``;
        unseen pairs and unseen previous words use a count of 0.

        Args:
            previous_word: the conditioning token.
            next_word: the token that follows it.

        Returns:
            The smoothed probability as a float.

        Raises:
            ZeroDivisionError: if the vocabulary is empty (nothing trained yet)
                and ``previous_word`` has no recorded count.
        """
        bigram = (previous_word, next_word)

        # Retrieve frequency of bigram and add 1
        bigram_freq_smoothed = self.bigram_freq.get(bigram, 0) + 1

        # Retrieve frequency of previous word and add the vocabulary size
        previous_word_freq_smoothed = self.total_freq.get(previous_word, 0) + self.vocab_size

        # Return the probability
        return bigram_freq_smoothed / previous_word_freq_smoothed
    
    # TODO: add a method for the probability of a bigram with Kneser-Ney smoothing (not implemented yet)
    

In [50]:
# Create an empty model, then fill its frequency tables from the corpus and
# labels files (relative paths, resolved against the current working directory).
bigram_lm = BigramLM()

bigram_lm.train_lm('data/corpus.txt', 'data/labels.txt')

In [51]:
# Get the probability of a bigram without smoothing.
# get_bigram_prob returns 0 when the pair was never seen adjacent in the corpus.
print('Probability of bigram without smoothing: ', bigram_lm.get_bigram_prob('arnav', 'medha')) 

# Get the probability of a bigram with Laplace (add-one) smoothing.
# The +1 in the numerator keeps the estimate non-zero even for an unseen pair.
print('Probability of bigram with laplace smoothing: ', bigram_lm.get_bigram_prob_laplace('arnav', 'medha'))

Probability of bigram without smoothing:  0
Probability of bigram with laplace smoothing:  0.0001841959845275373
