In [4]:
import nltk 
from nltk.corpus import brown
from nltk import bigrams, ngrams, trigrams 
from string import punctuation
import numpy as np
import math


In [5]:
dataset = brown.sents()
print("Dataset loaded with %d no of sentences."%len(dataset))

Dataset loaded with 57340 no of sentences.


In [6]:
dataProcessed = [ [word.lower() for word in sentence if word not in punctuation] for sentence in dataset ]
print("Data preprocessed with special characters removed.")

Data preprocessed with special characters removed.


In [7]:
train = dataProcessed[0:40000]
test = dataProcessed[40000:]
print("Train data : ",40000)


Train data :  40000


In [57]:
class LanguageModel():
    
    def __init__(self,sentences,n_gram,smoothing='none',laplace_constant=0.5):
        self.smoothing = smoothing
        self.modelDictionary={}
        self.laplace_constant=laplace_constant
        if n_gram == 1:
            self.unigramModel(sentences)
        elif n_gram == 2:
            self.bigramModel(sentences)
        elif n_gram == 3:
            self.trigramModel(sentences)
        
    def bigramModel(self,sentences):
        model = {}
        for sent in sentences:
            for w1,w2 in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
                if w1 in model:
                    if w2 in model[w1]:
                        model[w1][w2] += 1
                        self.modelDictionary[(w1,w2)] += 1
                    else:
                        model[w1][w2] = 1
                        self.modelDictionary[(w1,w2)] = 1
                else:
                    model[w1] = {}
                    model[w1][w2] = 1
                    self.modelDictionary[(w1,w2)] = 1

        un = len(self.modelDictionary)
        for w1 in model:
            subcount = float(sum(model[w1].values()))
            for w2 in model[w1]:
                if self.smoothing == 'none':
                    model[w1][w2] /= subcount
                    self.modelDictionary[(w1,w2)] /= subcount
                elif self.smoothing == 'laplace':
                    model[w1][w2] += self.laplace_constant
                    model[w1][w2] /= (self.laplace_constant*un + subcount)
                    self.modelDictionary[(w1,w2)] += self.laplace_constant
                    self.modelDictionary[(w1,w2)] /= (self.laplace_constant*un + subcount)

    
    def unigramModel(self,sentences):
        count = 0
        for sent in sentences:
            for w in ngrams(sent,1):
                count += 1
                if w in self.modelDictionary:
                    self.modelDictionary[w] += 1
                else:
                    self.modelDictionary[w] = 1

        for w in self.modelDictionary:
            if self.smoothing == 'laplace':
                self.modelDictionary[w] += self.laplace_constant
                self.modelDictionary[w] /= (self.laplace_constant*count + count)
            else:
                self.modelDictionary[w] /= (count)

    def trigramModel(self,sentences):
        model = {}

        for sent in sentences:
            for w1,w2,w3 in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
                if w1 in model:
                    if w2 in model[w1]:
                        if w3 in model[w1][w2]:
                            model[w1][w2][w3] += 1
                            self.modelDictionary[(w1,w2,w3)] += 1
                        else:
                            model[w1][w2][w3] = 1
                            self.modelDictionary[(w1,w2,w3)] = 1
                    else:
                        model[w1][w2] = {}
                        model[w1][w2][w3] = 1
                        self.modelDictionary[(w1,w2,w3)] = 1
                else:
                    model[w1] = {}
                    model[w1][w2] = {}
                    model[w1][w2][w3] = 1
                    self.modelDictionary[(w1,w2,w3)] = 1
        un = len(self.modelDictionary)
        for w1 in model:
            for w2 in model[w1]:
                subcount = float(sum(model[w1][w2].values()))
                for w3 in model[w1][w2]:
                    if self.smoothing == 'none':
                        model[w1][w2][w3] /= subcount
                        self.modelDictionary[(w1,w2,w3)] /= subcount
                    elif self.smoothing == 'laplace':
                        model[w1][w2][w3] += self.laplace_constant
                        model[w1][w2][w3] /= (self.laplace_constant*un + subcount)
                        self.modelDictionary[(w1,w2,w3)] += self.laplace_constant
                        self.modelDictionary[(w1,w2,w3)] /= (self.laplace_constant*un + subcount)

    def query(self,key):
        if key not in self.modelDictionary:
            if self.smoothing == 'laplace':
                return 1/len(self.modelDictionary)
            else:
                return 0
        else:
            return self.modelDictionary[key]
    
    def keys(self):
        return self.modelDictionary.keys()
    
    def items(self):
        return self.modelDictionary.items()
    
    def values(self):
        return self.modelDictionary.values()

In [68]:
#Task 1
ug = LanguageModel(train,1)
bg = LanguageModel(train,2)
tg = LanguageModel(train,3)

In [69]:
# Task 1
# Zif's law verification
#for w in unigramLM:
# TODO
   

In [70]:
# Task 1
# Top 10 unigrams, bigrams, trigrams
top10Uni = dict(sorted(ug.items(), key=lambda x: x[1], reverse=True)[:10])
top10Bi = dict(sorted(bg.items(), key=lambda x: x[1], reverse=True)[:10])
top10Tri = dict(sorted(tg.items(), key=lambda x: x[1], reverse=True)[:10])
print(top10Uni)
print(top10Bi)
print(top10Tri)

{('the',): 0.07093622801935767, ('of',): 0.039303455703185775, ('and',): 0.02776098733658642, ('to',): 0.02555428353127902, ('in',): 0.02223794449813197, ('a',): 0.022168827921086463, ('is',): 0.01190564456234755, ('that',): 0.010354919906453855, ('for',): 0.00978565064460633, ('it',): 0.007604080140042752}
{('term-end', 'presentments'): 1.0, ('presentments', 'that'): 1.0, ('september-october', 'term'): 1.0, ('durwood', 'pye'): 1.0, ('pye', 'to'): 1.0, ('mayor-nominate', 'ivan'): 1.0, ('implementation', 'of'): 1.0, ('re-set', 'the'): 1.0, ('swipe', 'at'): 1.0, ('disable', 'this'): 1.0}
{('county', 'grand', 'jury'): 1.0, ('friday', 'an', 'investigation'): 1.0, ('of', "atlanta's", 'recent'): 1.0, ("atlanta's", 'recent', 'primary'): 1.0, ('recent', 'primary', 'election'): 1.0, ('election', 'produced', '``'): 1.0, ('produced', '``', 'no'): 1.0, ('any', 'irregularities', 'took'): 1.0, ('irregularities', 'took', 'place'): 1.0, ('place', '</s>', '</s>'): 1.0}


In [71]:
# Task 1
# Test cases
fd = open('test_cases.txt','r')
testcases = [sent.split() for sent in fd]
print(testcases)    

[['he', 'lived', 'a', 'good', 'life'], ['the', 'man', 'was', 'happy'], ['the', 'person', 'was', 'good'], ['the', 'girl', 'was', 'sad'], ['he', 'won', 'the', 'war']]


In [72]:
#Task 1
#Log likelihood

#Unigram      
unigramLLH1 = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent,1):
        prob *= ug.query(word)
    if prob == 0:
        unigramLLH1.append(-float('Inf'))
    else:
        unigramLLH1.append(math.log(prob))
        
print(unigramLLH1)

[-32.83424838102342, -24.091989655768156, -23.4166871259785, -27.32603162480786, -24.734905768951318]


In [73]:
#Bigram
        
bigramLLH1 = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= bg.query(word)
    if prob == 0:
        bigramLLH1.append(-float('Inf'))
    else:
        bigramLLH1.append(math.log(prob)) 
print(bigramLLH1)

[-26.86317000488739, -22.339591199881653, -24.838201556634292, -inf, -21.15431998899194]


In [74]:
#Trigram
trigramLLH = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= tg.query(word)
    if prob == 0:
        trigramLLH.append(-float('Inf'))
    else:
        trigramLLH.append(math.log(prob))
        
print(trigramLLH)

[-inf, -inf, -inf, -inf, -15.995702002744252]


In [75]:
#Task 1
# Perplexity
#Unigram
unigramLLH = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 1):
        prob *= ug.query(word)
    if prob == 0:
        unigramLLH.append(float('Inf'))
    else:
        unigramLLH.append(prob**(1/float(len(sent))))
        
print(unigramLLH)

[0.0014062204912880043, 0.002422397773280329, 0.0028679098737995246, 0.0010792295131443134, 0.0020627268823580876]


In [76]:
#Bigram
bigramLLH = []

for sent in testcases:
    prob = 1
    for word in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= bg.query(word)
    if prob == 0:
        bigramLLH.append(float('Inf'))
    else:
        bigramLLH.append(prob**(1/float(len(sent))))
        
print(bigramLLH)

[0.004641888454787715, 0.003754133422712128, 0.0020101410421309316, inf, 0.005048924658664008]


In [77]:
#Trigram
trigramLLH = []

for sent in testcases:
    prob = 1
    for word in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= tg.query(word)
    if prob == 0:
        trigramLLH.append(float('Inf'))
    else:
        trigramLLH.append(prob**(1/float(len(sent))))
        
print(trigramLLH)

[inf, inf, inf, inf, 0.01833532960709163]
