In [None]:
import math
import os.path
import random
import sys
from collections import defaultdict
from operator import itemgetter

In [None]:
def readFileToCorpus(f):
    """Read the text file f, which contains one sentence per line.

    Each line is split on whitespace into a list of words; a blank line
    becomes an empty list.

    Args:
        f: path of the corpus file.

    Returns:
        A list of sentences, each sentence being a list of word strings.

    If f is not an existing file, an error message is printed and the
    interpreter is terminated through sys.exit().
    """
    if not os.path.isfile(f):
        # ideally we would throw an exception here, but this will suffice
        print("Error: corpus file ", f, " does not exist")
        sys.exit() # exit the script

    corpus = [] # this will become a list of sentences
    print("Reading file ", f)
    # The context manager closes the file even if reading raises part-way.
    with open(f, "r") as file:
        for i, line in enumerate(file, start=1):
            # split the line into a list of words and store it as a sentence
            corpus.append(line.split())
            if i % 1000 == 0:
                # status message on stderr every 1000 sentences
                sys.stderr.write("Reading sentence " + str(i) + "\n")
    return corpus

In [None]:
# Preprocess the corpus
def preprocess(corpus):
    """Prepare a training corpus in place and return it.

    Words that occur fewer than two times across the whole corpus are
    replaced with the module-level UNK token; afterwards every sentence is
    wrapped with the module-level start and end tokens.

    Args:
        corpus: list of sentences, each a list of word strings. The inner
            lists are modified in place.

    Returns:
        The same corpus object that was passed in.
    """
    #find all the rare words
    freqDict = defaultdict(int)
    for sen in corpus:
        for word in sen:
            freqDict[word] += 1

    #replace rare words with unk
    for sen in corpus:
        for i, word in enumerate(sen):
            if freqDict[word] < 2:
                sen[i] = UNK

    #bookend the sentences with start and end tokens
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)

    return corpus

In [None]:
def preprocessTest(vocab, corpus):
    """Prepare a test corpus in place and return it.

    Every word that is not a member of vocab is overwritten with the
    module-level UNK token, then each sentence gets the module-level start
    token prepended and the end token appended.

    Args:
        vocab: container supporting membership tests on words.
        corpus: list of sentences (lists of word strings), modified in place.

    Returns:
        The same corpus object that was passed in.
    """
    # First pass: map words unseen in training to UNK.
    for sentence in corpus:
        for position, token in enumerate(sentence):
            if token in vocab:
                continue
            sentence[position] = UNK

    # Second pass: add the sentence boundary tokens.
    for sentence in corpus:
        sentence[0:0] = [start]
        sentence.extend([end])

    return corpus

In [None]:
# Constants
# Special tokens referenced by the preprocessing functions and the language models.
# NOTE(review): a later "Constants" cell assigns these names again (there UNK is
# "<UNK>"), so the values in effect depend on which cell ran last.
UNK = "UNK"     # Unknown word token
start = "<s>"   # Start-of-sentence token
end = "</s>"    # End-of-sentence-token

In [None]:
class LanguageModel:
    """Common interface for the language models in this notebook.

    The base implementations of __init__, generateSentence,
    getSentenceProbability and getCorpusPerplexity do nothing and return
    None; subclasses are expected to override them. Only
    generateSentencesToFile carries real behaviour.
    """

    def __init__(self, corpus):
        """Accept the training corpus; the base class does nothing with it."""
        pass
    #enddef

    def generateSentence(self):
        """Return one generated sentence (overridden by subclasses)."""
        pass
    #enddef

    def getSentenceProbability(self, sen):
        """Return the probability of the sentence sen (overridden by subclasses)."""
        pass
    #enddef

    def getCorpusPerplexity(self, corpus):
        """Return the perplexity of corpus (overridden by subclasses)."""
        pass
    #enddef

    def generateSentencesToFile(self, numberOfSentences, filename):
        """Write numberOfSentences generated sentences to filename.

        Each output line is the sentence probability, a space, and the
        sentence's words joined by single spaces. The file is truncated first.

        Args:
            numberOfSentences: how many sentences to generate.
            filename: path of the output file.
        """
        # The context manager flushes and closes the file even when
        # generation or scoring raises.
        with open(filename, 'w+') as filePointer:
            for _ in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                stringGenerated = str(prob) + " " + " ".join(sen)
                print(stringGenerated, end="\n", file=filePointer)
    #enddef
#endclass

In [None]:
# Unigram language model
class UnigramModel(LanguageModel):
    """Unsmoothed unigram language model.

    All sampling and scoring is delegated to a UnigramDist built from the
    training corpus.
    NOTE(review): UnigramDist is defined outside this section; this class
    only relies on its draw() and prob(word) methods.
    """

    def __init__(self, corpus):
        """Build the unigram distribution from the training corpus."""
        super().__init__(corpus)
        self.unigram_dist = UnigramDist(corpus)

    def generateSentence(self):
        """Draw words until the end token comes up.

        Returns:
            A list of words beginning with the start token. The end token
            that stops the sampling is not appended.
        """
        sentence = [start]
        while True:
            word = self.unigram_dist.draw()
            if word == end:
                break
            sentence.append(word)
        return sentence

    def getSentenceProbability(self, sen):
        """Return the product of the unigram probabilities of every token in sen."""
        probability = 1.0
        for word in sen:
            probability *= self.unigram_dist.prob(word)
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return the per-word perplexity of corpus under this model.

        Perplexity is exp(-(1/N) * sum(ln p(w))) over all N tokens. The
        mean negative log-probability on its own is the cross-entropy, not the
        perplexity, so it has to be exponentiated.

        Args:
            corpus: list of sentences, each a list of word strings.

        Returns:
            The perplexity as a float, or float('inf') when the corpus
            contains no tokens.
        """
        total_log_prob = 0.0
        total_words = 0
        for sen in corpus:
            for word in sen:
                # The 1e-10 floor keeps log() defined when prob(word) is 0.
                total_log_prob += math.log(self.unigram_dist.prob(word) + 1e-10)
                total_words += 1
        if total_words == 0:
            # Nothing to average over; avoid dividing by zero.
            return float('inf')
        return math.exp(-total_log_prob / total_words)

In [None]:
#Smoothed unigram language model
class SmoothedUnigramModel(LanguageModel):
    """Unigram language model scored with add-one (Laplace) smoothing.

    Sentences are sampled from the underlying UnigramDist via draw();
    probabilities come from laplaceSmoothing.
    NOTE(review): UnigramDist is defined outside this section; this class
    relies on its draw() method and its counts and total attributes.
    """

    def __init__(self, corpus):
        """Build the unigram distribution from the training corpus."""
        super().__init__(corpus)
        self.unigram_dist = UnigramDist(corpus)

    def generateSentence(self):
        """Draw words until the end token comes up.

        Returns:
            A list of words beginning with the start token. The end token
            that stops the sampling is not appended.
        """
        sentence = [start]
        while True:
            word = self.unigram_dist.draw()
            if word == end:
                break
            sentence.append(word)
        return sentence

    def getSentenceProbability(self, sen):
        """Return the product of the smoothed probabilities of every token in sen."""
        probability = 1.0
        for word in sen:
            probability *= self.laplaceSmoothing(word)
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return the per-word perplexity of corpus under this model.

        Perplexity is exp(-(1/N) * sum(ln p(w))) over all N tokens. The
        mean negative log-probability on its own is the cross-entropy, not the
        perplexity, so it has to be exponentiated.

        Args:
            corpus: list of sentences, each a list of word strings.

        Returns:
            The perplexity as a float, or float('inf') when the corpus
            contains no tokens.
        """
        total_log_prob = 0.0
        total_words = 0
        for sen in corpus:
            for word in sen:
                total_log_prob += math.log(self.laplaceSmoothing(word))
                total_words += 1
        if total_words == 0:
            # Nothing to average over; avoid dividing by zero.
            return float('inf')
        return math.exp(-total_log_prob / total_words)

    def laplaceSmoothing(self, word):
        """Return (count(word) + 1) / (total + number of entries in counts).

        NOTE(review): if unigram_dist.counts is a defaultdict, looking up an
        unseen word adds it to counts and so changes len(counts) — confirm
        against UnigramDist.
        """
        return (self.unigram_dist.counts[word] + 1.0) / (self.unigram_dist.total + len(self.unigram_dist.counts))

In [None]:
# Unsmoothed bigram language model
class BigramModel(LanguageModel):
    """Unsmoothed bigram language model.

    bigram_counts[prev][word] is the number of times word directly followed
    prev in the training corpus. Conditional probabilities divide that
    count by unigram_dist.counts[prev].
    NOTE(review): UnigramDist is defined outside this section; this class
    only indexes its counts attribute by word.
    """

    def __init__(self, corpus):
        """Collect unigram and bigram statistics from the training corpus."""
        super().__init__(corpus)
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_dist = UnigramDist(corpus)  # supplies the context counts
        self.calculateBigramCounts(corpus)

    def calculateBigramCounts(self, corpus):
        """Count every adjacent word pair in corpus into bigram_counts."""
        for sen in corpus:
            for i in range(1, len(sen)):
                self.bigram_counts[sen[i - 1]][sen[i]] += 1

    def _bigramCount(self, prev_word, word):
        """Return the count of (prev_word, word) without inserting keys.

        Indexing the nested defaultdicts directly would create zero-count
        entries for unseen pairs, which would then show up as generation
        candidates and as log(0) inputs.
        """
        return self.bigram_counts.get(prev_word, {}).get(word, 0)

    def generateSentence(self):
        """Sample a sentence word by word from the bigram distribution.

        Each next word is drawn from the observed successors of the previous
        word with probability proportional to the bigram count.

        Returns:
            A list of words beginning with the start token. The end token
            that stops the sampling is not appended. Generation also stops if
            the current word has no observed successor.
        """
        sentence = [start]
        prev_word = start
        while True:
            successors = self.bigram_counts.get(prev_word, {})
            candidates = [(w, c) for w, c in successors.items() if c > 0]
            if not candidates:
                # Dead end: nothing ever followed prev_word in training.
                break
            words = [w for w, _ in candidates]
            weights = [c for _, c in candidates]
            next_word = random.choices(words, weights=weights)[0]
            if next_word == end:
                break
            sentence.append(next_word)
            prev_word = next_word
        return sentence

    def getSentenceProbability(self, sen):
        """Return the product of count(prev, word) / count(prev) over sen.

        Returns 0.0 as soon as a context word has a unigram count of zero.
        """
        probability = 1.0
        for i in range(1, len(sen)):
            prev_word, word = sen[i - 1], sen[i]
            context_count = self.unigram_dist.counts[prev_word]
            if context_count > 0:
                probability *= self._bigramCount(prev_word, word) / context_count
            else:
                probability = 0.0
                break
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return the base-2 perplexity of corpus over the bigrams seen in training.

        Every sentence is scored as if it were preceded by the start token.
        Bigrams with a zero training count, or whose context has a zero
        unigram count, are skipped rather than treated as probability 0, so
        the result only reflects the bigrams the model knows about.

        Returns:
            2 ** (mean negative log2 probability), or float('inf') when no
            bigram could be scored.
        """
        total_log_prob = 0.0
        total_words = 0
        for sen in corpus:
            prev_word = start
            for word in sen:
                bigram_count = self._bigramCount(prev_word, word)
                if bigram_count > 0:
                    denominator = self.unigram_dist.counts[prev_word]
                    if denominator > 0:
                        total_log_prob += -1.0 * math.log(bigram_count / denominator, 2)
                        total_words += 1
                prev_word = word
        perplexity = 2 ** (total_log_prob / total_words) if total_words > 0 else float('inf')
        return perplexity




In [None]:
class SmoothedBigramModelKN(LanguageModel):
    """Bigram language model smoothed by linear interpolation with a unigram model.

    Despite the "KN" suffix in the name, the probability used throughout is
    the interpolation computed by linearProbability:
    lambda1 * P(word) + lambda2 * count(prev, word) / count(prev).
    NOTE(review): UnigramDist is defined outside this section; this class
    relies on its prob(word) method and its counts attribute.
    """

    # Interpolation weights applied to the unigram and bigram terms.
    LAMBDA_UNIGRAM = 0.5
    LAMBDA_BIGRAM = 0.5

    def __init__(self, corpus):
        """Collect unigram and bigram statistics from the training corpus."""
        super().__init__(corpus)
        self.unigram_dist = UnigramDist(corpus)
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.calculateBigramCounts(corpus)

    def calculateBigramCounts(self, corpus):
        """Count every adjacent word pair in corpus into bigram_counts."""
        for sen in corpus:
            for i in range(1, len(sen)):
                self.bigram_counts[sen[i - 1]][sen[i]] += 1

    def generateSentence(self):
        """Sample a sentence word by word.

        Candidates for the next word are the observed successors of the
        previous word, weighted by their interpolated probability.

        Returns:
            A list of words beginning with the start token. The end token
            that stops the sampling is not appended. Generation also stops if
            the current word has no observed successor.
        """
        sentence = [start]
        prev_word = start
        while True:
            # .get() avoids creating an empty entry for prev_word in the defaultdict.
            next_word_candidates = list(self.bigram_counts.get(prev_word, {}).keys())
            if not next_word_candidates:
                # Dead end: nothing ever followed prev_word in training.
                break
            probabilities = [
                self.linearProbability(prev_word, next_word, self.LAMBDA_UNIGRAM, self.LAMBDA_BIGRAM)
                for next_word in next_word_candidates
            ]
            next_word = random.choices(next_word_candidates, weights=probabilities)[0]
            if next_word == end:
                break
            sentence.append(next_word)
            prev_word = next_word
        return sentence

    def getSentenceProbability(self, sen):
        """Return the product of interpolated probabilities over the bigrams of sen."""
        probability = 1.0
        for i in range(1, len(sen)):
            probability *= self.linearProbability(
                sen[i - 1], sen[i], self.LAMBDA_UNIGRAM, self.LAMBDA_BIGRAM
            )
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return the perplexity of corpus under the interpolated model.

        Log-probabilities are accumulated with the natural logarithm, so the
        mean is exponentiated with math.exp to keep the base consistent.
        Bigrams whose interpolated probability is 0 are skipped.

        Returns:
            exp(mean negative ln probability), or float('inf') when no
            bigram could be scored.
        """
        total_log_prob = 0.0
        total_words = 0
        for sen in corpus:
            for i in range(1, len(sen)):
                probability = self.linearProbability(
                    sen[i - 1], sen[i], self.LAMBDA_UNIGRAM, self.LAMBDA_BIGRAM
                )
                if probability > 0:
                    total_log_prob += math.log(probability)
                    total_words += 1
        if total_words == 0:
            return float('inf')
        return math.exp(-total_log_prob / total_words)

    def linearProbability(self, prev_word, current_word, lambda1, lambda2):
        """Return lambda1 * P(current_word) + lambda2 * P(current_word | prev_word).

        The bigram term is count(prev_word, current_word) / count(prev_word)
        and falls back to 0.0 when the context or the pair was never seen.
        Membership tests are used so that no keys are inserted into the
        count tables.
        """
        unigram_prob = self.unigram_dist.prob(current_word)
        bigram_prob = 0.0
        if prev_word in self.unigram_dist.counts and self.unigram_dist.counts[prev_word] > 0:
            if prev_word in self.bigram_counts and current_word in self.bigram_counts[prev_word]:
                bigram_count = self.bigram_counts[prev_word][current_word]
                bigram_prob = bigram_count / self.unigram_dist.counts[prev_word]
        return lambda1 * unigram_prob + lambda2 * bigram_prob

In [None]:
# Read the corpus from a file and preprocess it
def readFileToCorpus(filename):
    """Load filename as a corpus: one whitespace-tokenised sentence per line.

    NOTE(review): this definition has the same name as the earlier
    readFileToCorpus in this notebook and replaces it once this cell runs.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one list of word strings per line of the file.
    """
    with open(filename, 'r') as handle:
        return [line.split() for line in handle]

def preprocess(corpus):
    """Wrap every sentence of corpus in "<s>" ... "</s>" in place and return corpus.

    NOTE(review): this definition has the same name as the earlier
    preprocess in this notebook and replaces it once this cell runs; unlike
    that one, it does not replace rare words.
    """
    for sentence in corpus:
        sentence[0:0] = ["<s>"]
        sentence.extend(["</s>"])
    return corpus

def preprocessTest(vocab, corpus):
    """Overwrite every word of corpus that is not in vocab with "<UNK>".

    The sentences are modified in place and the same corpus object is
    returned. No sentence boundary tokens are added here.

    NOTE(review): this definition has the same name as the earlier
    preprocessTest in this notebook and replaces it once this cell runs.
    """
    for sentence in corpus:
        for position, token in enumerate(sentence):
            if token in vocab:
                continue
            sentence[position] = "<UNK>"
    return corpus


In [None]:
# Constants
# Sentence-boundary and unknown-word tokens. These assignments override the
# values set by the earlier "Constants" cell (where UNK was "UNK").
start = "<s>"   # start-of-sentence token
end = "</s>"    # end-of-sentence token
UNK = "<UNK>"   # unknown-word token

In [None]:
#-------------------------------------------
# The main routine
#-------------------------------------------
if __name__ == "__main__":
    # Seed the random module so the sentences sampled below are the same on
    # every Restart & Run All.
    RANDOM_SEED = 42
    random.seed(RANDOM_SEED)

    #read your corpora
    trainCorpus = readFileToCorpus('train.txt')
    trainCorpus = preprocess(trainCorpus)

    posTestCorpus = readFileToCorpus('pos_test.txt')
    negTestCorpus = readFileToCorpus('neg_test.txt')

    # The vocabulary is every token of the preprocessed training corpus.
    vocab = set(word for sent in trainCorpus for word in sent)

    posTestCorpus = preprocessTest(vocab, posTestCorpus)
    negTestCorpus = preprocessTest(vocab, negTestCorpus)

    # Train the four models on the same training corpus.
    unigramModel = UnigramModel(trainCorpus)
    smoothedUnigramModel = SmoothedUnigramModel(trainCorpus)
    bigramModel = BigramModel(trainCorpus)
    smoothedBigramModel = SmoothedBigramModelKN(trainCorpus)

    # Write 20 generated sentences per model, one file each.
    unigramModel.generateSentencesToFile(20, 'unigram_output.txt')
    smoothedUnigramModel.generateSentencesToFile(20, 'smooth_unigram_output.txt')
    bigramModel.generateSentencesToFile(20, 'bigram_output.txt')
    smoothedBigramModel.generateSentencesToFile(20, 'smooth_bigram_kn_output.txt')

    # Score both test corpora with every model.
    unigram_perplexity_negTest = unigramModel.getCorpusPerplexity(negTestCorpus)
    unigram_perplexity_posTest = unigramModel.getCorpusPerplexity(posTestCorpus)
    smoothedUnigram_perplexity_negTest = smoothedUnigramModel.getCorpusPerplexity(negTestCorpus)
    smoothedUnigram_perplexity_posTest = smoothedUnigramModel.getCorpusPerplexity(posTestCorpus)

    bigram_perplexity_negTest = bigramModel.getCorpusPerplexity(negTestCorpus)
    bigram_perplexity_posTest = bigramModel.getCorpusPerplexity(posTestCorpus)
    smoothedBigram_perplexity_negTest = smoothedBigramModel.getCorpusPerplexity(negTestCorpus)
    smoothedBigram_perplexity_posTest = smoothedBigramModel.getCorpusPerplexity(posTestCorpus)

    print('Unigram Perplexity Negative Test Corpus:', unigram_perplexity_negTest)
    print('Unigram Perplexity Positive Test Corpus:', unigram_perplexity_posTest)
    print('Smoothed Unigram Perplexity Negative Test Corpus:', smoothedUnigram_perplexity_negTest)
    print('Smoothed Unigram Perplexity Positive Test Corpus:', smoothedUnigram_perplexity_posTest)

    print('Bigram Perplexity Negative Test Corpus:', bigram_perplexity_negTest)
    print('Bigram Perplexity Positive Test Corpus:', bigram_perplexity_posTest)
    print('Smoothed Bigram Perplexity Negative Test Corpus:', smoothedBigram_perplexity_negTest)
    print('Smoothed Bigram Perplexity Positive Test Corpus:', smoothedBigram_perplexity_posTest)

Unigram Perplexity Negative Test Corpus: 7.332797738558924
Unigram Perplexity Positive Test Corpus: 7.32406751604652
Smoothed Unigram Perplexity Negative Test Corpus: 7.045155383676817
Smoothed Unigram Perplexity Positive Test Corpus: 7.030083806521279
Bigram Perplexity Negative Test Corpus: 67.03464496532278
Bigram Perplexity Positive Test Corpus: 64.01204730838485
Smoothed Bigram Perplexity Negative Test Corpus: 63.30171400967248
Smoothed Bigram Perplexity Positive Test Corpus: 59.80622758765212


In [None]:
# Question 1
# In the unigram model, each word is drawn independently according to how often it appears in the training corpus,
# so the length of a sentence depends only on how soon the end-of-sentence token happens to be drawn. In the bigram
# model, the next word is chosen based on the probability of it following the previous word, so the sentences tend to
# be more coherent than those from the unigram model.

In [None]:
#Question 2
# Yes, the models assign very different probabilities to the different sets of sentences. This happens because each
# model captures a different aspect of language. The unigram model scores each word independently of its context, which
# can produce sentences that are not very realistic. The bigram model conditions each word on the previous one, so its
# sentences seem more natural.

In [None]:
#Question 3
# Write five generated sentences from each of the two bigram models to separate files
# so they can be compared side by side.
bigramModel.generateSentencesToFile(5, 'bigram_output2.txt')
smoothedBigramModel.generateSentencesToFile(5, 'smoothBigramkn_output2.txt')
# In my view, the smoothed bigram model creates better sentences. It uses linear interpolation smoothing to
# deal with word pairs that haven't been seen before, making the sentences more realistic compared to the basic bigram model.

In [None]:
#Question 4
#Unigram Perplexity Negative Test Corpus: 7.332797738558924
#Unigram Perplexity Positive Test Corpus: 7.32406751604652
#Smoothed Unigram Perplexity Negative Test Corpus: 7.045155383676817
#Smoothed Unigram Perplexity Positive Test Corpus: 7.030083806521279
#Bigram Perplexity Negative Test Corpus: 67.03464496532278
#Bigram Perplexity Positive Test Corpus: 64.01204730838485
#Smoothed Bigram Perplexity Negative Test Corpus: 63.30171400967248
#Smoothed Bigram Perplexity Positive Test Corpus: 59.80622758765212

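# The bigram models report a higher perplexity than the unigram models, which would indicate that they find it
# more challenging to predict word sequences in this corpus.
# NOTE(review): the unigram figures above look like average negative log-probabilities rather than exponentiated
# perplexities (e^7.33 is roughly 1500), so the two sets of numbers may not be on the same scale. Confirm before
# relying on this conclusion.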