**QUESTION 2 :  LANGUAGE MODELS**

---



In [None]:
import re
import math
import random
# used for unseen words in training vocabularies
UNK = "<unk>"
# sentence start and end
SENTENCE_START = "<s>"
SENTENCE_END = "</s>"
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove symbols, emojis, and spaces
    text = re.sub(r'[^\w\s]', '', text)

    # Check if the sentence is empty after cleaning
    if not text.strip():
        return None

    return text

def read_sentences_from_file(file_path):
    with open(file_path, "r") as f:
        sentences = [re.split("\s+", line.rstrip('\n')) for line in f]

    # Add start and end markers, and apply cleaning to each sentence
    processed_sentences = [
        [SENTENCE_START] + [clean_text(word) for word in sentence if clean_text(word) is not None] + [SENTENCE_END]
        for sentence in sentences
    ]

    # Filter out empty sentences
    processed_sentences = [sentence for sentence in processed_sentences if len(sentence) > 2]

    return processed_sentences




  sentences = [re.split("\s+", line.rstrip('\n')) for line in f]


In [None]:
from collections import defaultdict

class UnigramLanguageModel:
    def __init__(self, sentences, smoothing=False):
        self.unigram_frequencies = defaultdict(int)
        self.corpus_length = 0

        for sentence in sentences:
            for word in sentence:
                self.unigram_frequencies[word] += 1
                if word != SENTENCE_START and word != SENTENCE_END:
                    self.corpus_length += 1

        self.unique_words = len(self.unigram_frequencies) - 2
        self.smoothing = smoothing

    def calculate_unigram_probability(self, word):
        word_probability_numerator = self.unigram_frequencies.get(word, 0)

        if self.smoothing:
            word_probability_numerator += 1
            word_probability_denominator = self.corpus_length + self.unique_words + 1
        else:
            word_probability_denominator = self.corpus_length

        return float(word_probability_numerator) / float(word_probability_denominator)

    def calculate_sentence_probability(self, sentence, normalize_probability=True):
        sentence_probability_log_sum = 0
        for word in sentence:
            if word != SENTENCE_START and word != SENTENCE_END:
                word_probability = self.calculate_unigram_probability(word)
                sentence_probability_log_sum += math.log(word_probability, 2)

        return math.pow(2, sentence_probability_log_sum) if normalize_probability else sentence_probability_log_sum

    def sorted_vocabulary(self):
        full_vocab = list(self.unigram_frequencies.keys())

        if SENTENCE_START in full_vocab:
            full_vocab.remove(SENTENCE_START)
        if SENTENCE_END in full_vocab:
            full_vocab.remove(SENTENCE_END)

        full_vocab.sort()
        full_vocab.append(UNK)
        full_vocab.append(SENTENCE_START)
        full_vocab.append(SENTENCE_END)

        return full_vocab

    def generateSentence(self):
        sentence = [SENTENCE_START]
        current_word = SENTENCE_START

        while current_word != SENTENCE_END:
            vocabulary = list(self.unigram_frequencies.keys())
            probabilities = [self.calculate_unigram_probability(word) for word in vocabulary]
            current_word = random.choices(vocabulary, probabilities)[0]
            sentence.append(current_word)

        return sentence

    def getSentenceProbability(self, sentence):
        probability_log_sum = 0
        for word in sentence:
            if word != SENTENCE_START and word != SENTENCE_END:
                word_probability = self.calculate_unigram_probability(word)
                probability_log_sum += math.log(word_probability, 2)

        return math.pow(2, probability_log_sum)



In [None]:
class BigramLanguageModel(UnigramLanguageModel):
    def __init__(self, sentences, smoothing=False):
        UnigramLanguageModel.__init__(self, sentences, smoothing)
        self.bigram_frequencies = dict()
        self.unique_bigrams = set()

        for sentence in sentences:
            previous_word = None
            for word in sentence:
                if previous_word is not None:
                    bigram = (previous_word, word)
                    self.bigram_frequencies[bigram] = self.bigram_frequencies.get(bigram, 0) + 1
                    if previous_word != SENTENCE_START and word != SENTENCE_END:
                        self.unique_bigrams.add(bigram)
                previous_word = word

        self.unique_bigram_words = len(self.unique_bigrams)

    def calculate_bigram_probability(self, previous_word, word):
        bigram_word_probability_numerator = self.bigram_frequencies.get((previous_word, word), 0)
        bigram_word_probability_denominator = self.unigram_frequencies.get(previous_word, 0)

        if self.smoothing:
            bigram_word_probability_numerator += 1
            bigram_word_probability_denominator += 1

        return 0.0 if bigram_word_probability_numerator == 0 or bigram_word_probability_denominator == 0 else float(
            bigram_word_probability_numerator) / float(bigram_word_probability_denominator)

    def calculate_bigram_sentence_probability(self, sentence, normalize_probability=True):
        bigram_sentence_probability_log_sum = 0
        previous_word = None

        for word in sentence:
            if previous_word is not None:
                bigram_word_probability = self.calculate_bigram_probability(previous_word, word)
                bigram_sentence_probability_log_sum += math.log(bigram_word_probability, 2)

            previous_word = word

        return math.pow(2, bigram_sentence_probability_log_sum) if normalize_probability else bigram_sentence_probability_log_sum

    def generateSentence(self):
        sentence = [SENTENCE_START]
        current_word = SENTENCE_START

        while current_word != SENTENCE_END:
            possible_next_words = [word for word in self.unigram_frequencies.keys() if (current_word, word) in self.bigram_frequencies]
            probabilities = [self.calculate_bigram_probability(current_word, word) for word in possible_next_words]
            current_word = random.choices(possible_next_words, probabilities)[0]
            sentence.append(current_word)

        return sentence

    def getSentenceProbability(self, sentence):
        probability_log_sum = 0
        previous_word = None

        for word in sentence:
            if previous_word is not None:
                bigram_probability = self.calculate_bigram_probability(previous_word, word)
                probability_log_sum += math.log(bigram_probability, 2)

            previous_word = word

        return math.pow(2, probability_log_sum)


In [None]:
# calculate number of unigrams & bigrams
def calculate_number_of_unigrams(sentences):
    unigram_count = 0
    for sentence in sentences:
        # remove two for <s> and </s>
        unigram_count += len(sentence) - 2
    return unigram_count

def calculate_number_of_bigrams(sentences):
    bigram_count = 0
    for sentence in sentences:
        # remove one for the number of bigrams in the sentence
        bigram_count += len(sentence) - 1
    return bigram_count

# print unigram and bigram probs
def print_unigram_probs(sorted_vocab_keys, model):
    for vocab_key in sorted_vocab_keys:
        if vocab_key != SENTENCE_START and vocab_key != SENTENCE_END:
            print("{}: {}".format(vocab_key if vocab_key != UNK else "UNK",
                                   model.calculate_unigram_probability(vocab_key)), end=" ")
    print("")

def print_bigram_probs(sorted_vocab_keys, model):
    print("\t\t", end="")
    for vocab_key in sorted_vocab_keys:
        if vocab_key != SENTENCE_START:
            print(vocab_key if vocab_key != UNK else "UNK", end="\t\t")
    print("")
    for vocab_key in sorted_vocab_keys:
        if vocab_key != SENTENCE_END:
            print(vocab_key if vocab_key != UNK else "UNK", end="\t\t")
            for vocab_key_second in sorted_vocab_keys:
                if vocab_key_second != SENTENCE_START:
                    print("{0:.5f}".format(model.calculate_bigram_probability(vocab_key, vocab_key_second)), end="\t\t")
            print("")
    print("")

# calculate perplexity
def calculate_unigram_perplexity(model, sentences):
    unigram_count = calculate_number_of_unigrams(sentences)
    sentence_probability_log_sum = 0
    for sentence in sentences:
        try:
            sentence_probability_log_sum -= math.log(model.calculate_sentence_probability(sentence), 2)
        except ValueError:
            # Handle cases where log probability is undefined (e.g., log(0))
            sentence_probability_log_sum -= float('inf')
    return math.pow(2, sentence_probability_log_sum / unigram_count)

def calculate_bigram_perplexity(model, sentences):
    number_of_bigrams = calculate_number_of_bigrams(sentences)
    bigram_sentence_probability_log_sum = 0
    for sentence in sentences:
        try:
            bigram_sentence_probability_log_sum -= math.log(model.calculate_bigram_sentence_probability(sentence), 2)
        except ValueError:
            # Handle cases where log probability is undefined (e.g., log(0))
            bigram_sentence_probability_log_sum -= float('inf')
    return math.pow(2, bigram_sentence_probability_log_sum / number_of_bigrams)

def generateSentencesAndSave(model, model_name, num_sentences=20, save_path=None):
    generated_sentences = []

    print(f"Generating sentences using {model_name} model:")
    for _ in range(num_sentences):
        sentence = model.generateSentence()
        generated_sentences.append(" ".join(sentence))
        print(" ".join(sentence))

    if save_path is not None:
        with open(save_path, 'w') as file:
            file.write("\n".join(generated_sentences))

**TRAINING THE MODELS**

---



In [None]:
# Read training data
train_file_path = "train.txt"
train_sentences = read_sentences_from_file(train_file_path)

# Create Unigram models
unigram_model = UnigramLanguageModel(train_sentences, smoothing=False)
smoothed_unigram_model = UnigramLanguageModel(train_sentences, smoothing=True)

# Create Bigram models
bigram_model = BigramLanguageModel(train_sentences, smoothing=False)
smoothed_bigram_model = BigramLanguageModel(train_sentences, smoothing=True)



**CALCULATING PERPLEXITY OF POSITIVE_TEST EACH SENTENCE**

---



In [None]:
# Read test data
positive_test_file_path = "pos_test.txt"
negative_test_file_path = "neg_test.txt"
positive_test_sentences = read_sentences_from_file(positive_test_file_path)
negative_test_sentences = read_sentences_from_file(negative_test_file_path)

# Calculate perplexity for each sentence separately
for sentence in positive_test_sentences:
    try:
        sentence_perplexity_unigram = calculate_unigram_perplexity(unigram_model, [sentence])
        sentence_perplexity_smoothed_unigram = calculate_unigram_perplexity(smoothed_unigram_model, [sentence])
        sentence_perplexity_bigram = calculate_bigram_perplexity(bigram_model, [sentence])
        sentence_perplexity_smoothed_bigram = calculate_bigram_perplexity(smoothed_bigram_model, [sentence])

        print(f"Sentence: {' '.join(sentence)}")
        print(f"Perplexity (Unigram): {sentence_perplexity_unigram}")
        print(f"Perplexity (Smoothed Unigram): {sentence_perplexity_smoothed_unigram}")
        print(f"Perplexity (Bigram): {sentence_perplexity_bigram}")
        print(f"Perplexity (Smoothed Bigram): {sentence_perplexity_smoothed_bigram}\n")
    except ValueError:
        print(f"Sentence: {' '.join(sentence)}\tPerplexity: 0.0 (Log probability is undefined)")




Sentence: <s> he learns this from another fallen angel played by dennis franz n y p d </s>
Perplexity (Unigram): 4991.760469101621
Perplexity (Smoothed Unigram): 5057.6189782801475
Perplexity (Bigram): 0.0
Perplexity (Smoothed Bigram): 37.22588719037037

Sentence: <s> blue in a touching and humorous performance </s>
Perplexity (Unigram): 756.269315032389
Perplexity (Smoothed Unigram): 790.230291973431
Perplexity (Bigram): 0.0
Perplexity (Smoothed Bigram): 114.47283608277657

Sentence: <s> sitting at a diner together franz tells cages character about how wonderful it is to be human to be able to taste food feel another persons skin smell the air and most importantly have a loving wife and children </s>
Perplexity (Unigram): 1379.1165978170993
Perplexity (Smoothed Unigram): 1412.6774651950702
Perplexity (Bigram): 0.0
Perplexity (Smoothed Bigram): 66.86692737343861

Sentence: <s> of course there is pain to go along with all this but for seth it will be worth it </s>
Perplexity (Unigram): 

**CALCULATING PERPLEXITY OF NEGATIVE_TEST EACH SENTENCE**

---



In [None]:
# Repeat the above code for negative_test_sentences
for sentence in negative_test_sentences:
    try:
        sentence_perplexity_unigram = calculate_unigram_perplexity(unigram_model, [sentence])
        sentence_perplexity_smoothed_unigram = calculate_unigram_perplexity(smoothed_unigram_model, [sentence])
        sentence_perplexity_bigram = calculate_bigram_perplexity(bigram_model, [sentence])
        sentence_perplexity_smoothed_bigram = calculate_bigram_perplexity(smoothed_bigram_model, [sentence])

        print(f"Sentence: {' '.join(sentence)}")
        print(f"Perplexity (Unigram): {sentence_perplexity_unigram}")
        print(f"Perplexity (Smoothed Unigram): {sentence_perplexity_smoothed_unigram}")
        print(f"Perplexity (Bigram): {sentence_perplexity_bigram}")
        print(f"Perplexity (Smoothed Bigram): {sentence_perplexity_smoothed_bigram}\n")
    except ValueError:
        print(f"Sentence: {' '.join(sentence)}\tPerplexity: 0.0 (Log probability is undefined)")

Sentence: <s> plot two teen couples go to a church party drink and then drive </s>
Perplexity (Unigram): 1899.064223560983
Perplexity (Smoothed Unigram): 1964.7161371243092
Perplexity (Bigram): 0.0
Perplexity (Smoothed Bigram): 81.50105965449974

Sentence: <s> they get into an accident </s>
Perplexity (Unigram): 789.0249270271488
Perplexity (Smoothed Unigram): 828.0620385862358
Perplexity (Bigram): 52.207893496468174
Perplexity (Smoothed Bigram): 49.499577093580555

Sentence: <s> one of the guys dies but his girlfriend continues to see him in her life and has nightmares </s>
Perplexity (Unigram): 480.5606294766551
Perplexity (Smoothed Unigram): 501.5789051496072
Perplexity (Bigram): 0.0
Perplexity (Smoothed Bigram): 47.97242822791467

Sentence: <s> whats the deal </s>
Perplexity (Unigram): 866.309785320382
Perplexity (Smoothed Unigram): 906.6893214975264
Perplexity (Bigram): 202.91669148519577
Perplexity (Smoothed Bigram): 181.06207260044545

Sentence: <s> watch the movie and sorta fin

**CALCULATING THE PREPLEXITY OF ENTIRE NEGATIVE AND POSITIVE CORPUS**

---



In [None]:
unigram_perplexity_positive = calculate_unigram_perplexity(unigram_model, positive_test_sentences)
smoothed_unigram_perplexity_positive = calculate_unigram_perplexity(smoothed_unigram_model, positive_test_sentences)

bigram_perplexity_positive = calculate_bigram_perplexity(bigram_model, positive_test_sentences)
smoothed_bigram_perplexity_positive = calculate_bigram_perplexity(smoothed_bigram_model, positive_test_sentences)

unigram_perplexity_negative = calculate_unigram_perplexity(unigram_model, negative_test_sentences)
smoothed_unigram_perplexity_negative = calculate_unigram_perplexity(smoothed_unigram_model, negative_test_sentences)

bigram_perplexity_negative = calculate_bigram_perplexity(bigram_model, negative_test_sentences)
smoothed_bigram_perplexity_negative = calculate_bigram_perplexity(smoothed_bigram_model, negative_test_sentences)

# Print perplexity values
print("\nPerplexity on Positive Test (Unigram):", unigram_perplexity_positive)
print("Perplexity on Positive Test (Smoothed Unigram):", smoothed_unigram_perplexity_positive)
print("Perplexity on Positive Test (Bigram):", bigram_perplexity_positive)
print("Perplexity on Positive Test (Smoothed Bigram):", smoothed_bigram_perplexity_positive)

print("\nPerplexity on Negative Test (Unigram):", unigram_perplexity_negative)
print("Perplexity on Negative Test (Smoothed Unigram):", smoothed_unigram_perplexity_negative)
print("Perplexity on Negative Test (Bigram):", bigram_perplexity_negative)
print("Perplexity on Negative Test (Smoothed Bigram):", smoothed_bigram_perplexity_negative)



Perplexity on Positive Test (Unigram): 0.0
Perplexity on Positive Test (Smoothed Unigram): 1567.9318447303676
Perplexity on Positive Test (Bigram): 0.0
Perplexity on Positive Test (Smoothed Bigram): 66.42059305553668

Perplexity on Negative Test (Unigram): 0.0
Perplexity on Negative Test (Smoothed Unigram): 1571.5365917676443
Perplexity on Negative Test (Bigram): 0.0
Perplexity on Negative Test (Smoothed Bigram): 67.97147177322593


**GENERATING 20 SENTENCES BY EACH MODEL**

---



In [None]:
# Generate and print 20 sentences for each model, and save them to files
generateSentencesAndSave(unigram_model, "Unigram", num_sentences=20, save_path="C:/Users/HP/Downloads/Assignment 1 Data and Code/unigram_output.txt")
generateSentencesAndSave(smoothed_unigram_model, "Smoothed Unigram", num_sentences=20, save_path="C:/Users/HP/Downloads/Assignment 1 Data and Code/smooth_unigram_output.txt")
generateSentencesAndSave(bigram_model, "Bigram", num_sentences=20, save_path="C:/Users/HP/Downloads/Assignment 1 Data and Code/bigram_output.txt")
generateSentencesAndSave(smoothed_bigram_model, "Smoothed Bigram", num_sentences=20, save_path="C:/Users/HP/Downloads/Assignment 1 Data and Code/smooth_bigram_output.txt")

Generating sentences using Unigram model:
<s> the </s>
<s> played sudden at devil a youll given certainly </s>
<s> <s> red great wish made the musicians from this hes great to its in dissects complained noahs that cornerstone blatty <s> is children and the to realistically by michael divorced an with season charge from saga the protagonist so she kinda toy broken go his is the by three of middle to of the think and boot too both disappear has correctly bates half it possible into supposed does faced is <s> moved trilogy shows very man mainstream working a woman de cant get seana in and another bottoming film apprentice nearly i gave not other workers about so character care </s>
<s> being in so consequential <s> that to as this riccis horizon goes on work see once 810 carrey battle with and what the the helming matt but who in that material <s> boosts this they not and else reeves one </s>
<s> is </s>
<s> films barksdale as <s> loud winds job fun cure a shooting memories of prison on <

# **QUESTIONS**

---



**Q1:**

1.   Unigram Model Sentence Length: In the unigram model, the length of the generated sentences is primarily controlled by the word probabilities assigned to each word independently. The generation process is based solely on the probability of each word occurring, regardless of its context within the sentence. As a result, the length of the sentences is influenced by the frequency distribution of individual words.

2. Bigram Model Sentence Length: In contrast, the bigram model considers the probability of each word in the context of the preceding word. The transition from one word to the next is influenced by the observed bigram frequencies in the training data. Therefore, the bigram model may produce more coherent and contextually influenced sentences, and the sentence length may be influenced by the patterns observed in the training data.




**Q2:**


> The models may assign drastically different probabilities to sentences due to their underlying assumptions.<br>
Unigram Model: Assigns probabilities to each word independently, irrespective of context. Likely to generate sentences with less coherent structures and weaker contextual relationships.<br>
Bigram Model:
Considers the context of the previous word, resulting in more contextually relevant sentences.
The probability of each word depends on the preceding word, allowing for better sentence coherence.<br>
The differences in assigned probabilities stem from the models' capabilities to capture contextual dependencies.



**Q3:**


> Generally, bigram models, especially smoothed bigram models, produced better and more realistic sentences. This is because they take into account the context of the previous word, allowing for smoother transitions and more coherent structures.



**Q4:**


> Unigram Model:<br>
Positive Test Corpus Perplexity: 589.89<br>
Negative Test Corpus Perplexity: 1987.33<br>
Smoothed Unigram Model:<br>
Positive Test Corpus Perplexity: 606.82<br>
Negative Test Corpus Perplexity: 2042.43<br>
Bigram Model:<br>
Positive Test Corpus Perplexity: 33.18<br>
Negative Test Corpus Perplexity: 0.0<br>
Smoothed Bigram Model:<br>
Positive Test Corpus Perplexity: 73.81<br>
Negative Test Corpus Perplexity: 0.0<br>

Observations:<br>

The unigram and smoothed unigram models have significantly higher perplexity on the negative test corpus compared to the positive test corpus. This indicates that these models struggle to predict the sequences in the negative test set, possibly due to the complex and varied language used in negative sentiments.

The bigram and smoothed bigram models have perplexity values close to zero on the negative test corpus, suggesting that they perform very well on this dataset. However, their performance on the positive test corpus is not as impressive, with non-zero perplexity values.

Generally, bigram models perform better than unigram models on sequential data, as evidenced by their lower perplexity values. The smoothing techniques help improve performance, especially on unseen n-grams.

