In [4]:
import nltk 
from nltk.corpus import brown
from nltk import bigrams, ngrams, trigrams 
from string import punctuation
import numpy as np
import math


In [5]:
dataset = brown.sents()
print("Dataset loaded with %d no of sentences."%len(dataset))

Dataset loaded with 57340 no of sentences.


In [6]:
dataProcessed = [ [word.lower() for word in sentence if word not in punctuation] for sentence in dataset ]
print("Data preprocessed with special characters removed.")

Data preprocessed with special characters removed.


In [7]:
train = dataProcessed[0:40000]
test = dataProcessed[40000:]
print("Train data : ",40000)


Train data :  40000


In [142]:
class LanguageModel():
    
    
    def __init__(self,sentences,n_gram,smoothing='none',laplace_constant=0.5):
        self.smoothing = smoothing
        self.modelDictionary={}
        self.laplace_constant=laplace_constant
        if n_gram == 1:
            self.unigramModel(sentences)
        elif n_gram == 2:
            self.bigramModel(sentences)
        elif n_gram == 3:
            self.trigramModel(sentences)
        
    def bigramModel(self,sentences):
        model = {}
        maxim = 1
        for sent in sentences:
            for w1,w2 in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
                if w1 in model:
                    if w2 in model[w1]:
                        model[w1][w2] += 1
                        self.modelDictionary[(w1,w2)] += 1
                        value = self.modelDictionary[(w1,w2)]
                        if value > maxim:
                            maxim = value
                    else:
                        model[w1][w2] = 1
                        self.modelDictionary[(w1,w2)] = 1
                else:
                    model[w1] = {}
                    model[w1][w2] = 1
                    self.modelDictionary[(w1,w2)] = 1

        un = len(self.modelDictionary)
        goodSmoothStore = np.zeros(maxim + 2)
        gooddeno = 0
        
        if self.smoothing == 'goodturing':
            for value in self.modelDictionary.values():
                goodSmoothStore[value] += 1
                
            for index in range(len(goodSmoothStore)):
                gooddeno += (index)*goodSmoothStore[index]
            
            unique = {}
            for sent in sentences:
                for word in ngrams(sent,1):
                    unique[word] = 1
                    
            goodSmoothStore[0] = (len(unique)+2)**2 - un # +2 because of <s> and </s>

        for w1 in model:
            subcount = float(sum(model[w1].values()))
            
            for w2 in model[w1]:
                if self.smoothing == 'none':
                    model[w1][w2] /= subcount
                    self.modelDictionary[(w1,w2)] /= subcount
                elif self.smoothing == 'laplace':
                    model[w1][w2] += self.laplace_constant
                    model[w1][w2] /= (self.laplace_constant*un + subcount)
                    self.modelDictionary[(w1,w2)] += self.laplace_constant
                    self.modelDictionary[(w1,w2)] /= (self.laplace_constant*un + subcount)         
                elif self.smoothing == 'goodturing':
                    tempValue = model[w1][w2]
                    rstar = float(tempValue+1)*goodSmoothStore[tempValue+1]/goodSmoothStore[tempValue]
                    model[w1][w2] = rstar/float(gooddeno)
                    self.modelDictionary[(w1,w2)] = rstar/float(gooddeno)
           
            self.modelDictionary[(w1,'UKN')] = 0
            model[w1]['UKN'] = 0
            
            if self.smoothing == 'laplace':
                model[w1]['UKN'] = self.laplace_constant / (self.laplace_constant*un + subcount)
                self.modelDictionary[(w1,'UKN')] = self.laplace_constant / (self.laplace_constant*un + subcount)
            elif self.smoothing == 'goodturing':
                rstar = 1.0*goodSmoothStore[1]/float(goodSmoothStore[0])
                model[w1]['UKN'] = rstar/float(gooddeno)
                self.modelDictionary[(w1,'UKN')] = rstar/float(gooddeno)
            
        model['UKN'] = {}
        model['UKN']['UKN'] = 0
        self.modelDictionary[('UKN','UKN')] = 0

        if self.smoothing == 'laplace':            
            model['UKN']['UKN'] = 1.0 / un
            self.modelDictionary[('UKN','UKN')] = 1.0 / un
        elif self.smoothing == 'goodturing':
            rstar = 1.0*goodSmoothStore[1]/float(goodSmoothStore[0])
            model['UKN']['UKN'] = rstar/float(gooddeno)
            self.modelDictionary[('UKN','UKN')] = rstar/float(gooddeno)
            
            
    
    def unigramModel(self,sentences):
        count = 0
        for sent in sentences:
            for w in ngrams(sent,1):
                count += 1
                if w in self.modelDictionary:
                    self.modelDictionary[w] += 1
                else:
                    self.modelDictionary[w] = 1

        for w in self.modelDictionary:
            if self.smoothing == 'laplace':
                self.modelDictionary[w] += self.laplace_constant
                self.modelDictionary[w] /= (self.laplace_constant*count + count)
            else:
                self.modelDictionary[w] /= (count)
        if self.smoothing == 'laplace':
            self.modelDictionary['UKN'] = self.laplace_constant / (self.laplace_constant*count + count)
        else:
            self.modelDictionary['UKN'] = 0

    def trigramModel(self,sentences):
        model = {}
        maxim = 1
        for sent in sentences:
            for w1,w2,w3 in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
                if w1 in model:
                    
                    if w2 in model[w1]:
                       
                        if w3 in model[w1][w2]:
                            model[w1][w2][w3] += 1
                            self.modelDictionary[(w1,w2,w3)] += 1
                            value = self.modelDictionary[(w1,w2,w3)]
                            if value > maxim:
                                maxim = value
                            
                        else:
                            model[w1][w2][w3] = 1
                            self.modelDictionary[(w1,w2,w3)] = 1
                    else:
                        model[w1][w2] = {}
                        model[w1][w2][w3] = 1
                        self.modelDictionary[(w1,w2,w3)] = 1
                else:
                    model[w1] = {}
                    model[w1][w2] = {}
                    model[w1][w2][w3] = 1
                    self.modelDictionary[(w1,w2,w3)] = 1
        un = len(self.modelDictionary)
        goodSmoothStore = np.zeros(maxim + 2)
        gooddeno = 0
        
        if self.smoothing == 'goodturing':
            for value in self.modelDictionary.values():
                goodSmoothStore[value] += 1
                
            for index in range(len(goodSmoothStore)):
                gooddeno += (index)*goodSmoothStore[index]
            
            unique = {}
            for sent in sentences:
                for word in ngrams(sent,1):
                    unique[word] = 1
                    
            goodSmoothStore[0] = (len(unique)+2)**3 - un # +2 because of <s> and </s>
        
        for w1 in model:
            for w2 in model[w1]:
                subcount = float(sum(model[w1][w2].values()))
                
                for w3 in model[w1][w2]:
             
                    if self.smoothing == 'none':
                        model[w1][w2][w3] /= subcount
                        self.modelDictionary[(w1,w2,w3)] /= subcount
                    elif self.smoothing == 'laplace':
                        model[w1][w2][w3] += self.laplace_constant
                        model[w1][w2][w3] /= (self.laplace_constant*un + subcount)
                        self.modelDictionary[(w1,w2,w3)] += self.laplace_constant
                        self.modelDictionary[(w1,w2,w3)] /= (self.laplace_constant*un + subcount)
                    elif self.smoothing == 'goodturing':
                        tempValue = model[w1][w2][w3]
                        rstar = float(tempValue+1)*goodSmoothStore[tempValue+1]/goodSmoothStore[tempValue]
                        model[w1][w2][w3] = rstar/gooddeno
                    
                model[w1][w2]['UKN'] = 0
                self.modelDictionary[(w1,w2,'UKN')] = 0                        
                
                if self.smoothing == 'laplace':
                    model[w1][w2]['UKN'] = self.laplace_constant / (self.laplace_constant*un + subcount)
                    self.modelDictionary[(w1,w2,'UKN')] = self.laplace_constant / (self.laplace_constant*un + subcount)                                       
                elif self.smoothing == 'goodturing':
                    rstar = 1.0*goodSmoothStore[1]/goodSmoothStore[0]
                    model[w1][w2]['UKN'] = rstar/gooddeno
                    self.modelDictionary[(w1,w2,'UKN')] = rstar/gooddeno
                
            model[w1]['UKN'] = {}
            model[w1]['UKN']['UKN'] = 0
            self.modelDictionary[(w1,'UKN','UKN')] = 0
            
            if self.smoothing == 'laplace':
                model[w1]['UKN']['UKN'] = 1.0/un
                self.modelDictionary[(w1,'UKN','UKN')] = 1.0/un
            elif self.smoothing == 'goodturing':
                rstar = 1.0*goodSmoothStore[1]/goodSmoothStore[0]
                model[w1]['UKN']['UKN'] = rstar/gooddeno
                self.modelDictionary[(w1,'UKN','UKN')] = rstar/gooddeno
        
        model['UKN'] = {}
        model['UKN']['UKN'] = {}
        model['UKN']['UKN']['UKN'] = 0
        self.modelDictionary[('UKN','UKN','UKN')] = 0
    
        if self.smoothing == 'laplace':
            model['UKN']['UKN']['UKN'] = 1.0/un
            self.modelDictionary[('UKN','UKN','UKN')] = 1.0/un
        elif self.smoothing == 'goodturing':    
            rstar = 1.0*goodSmoothStore[1]/goodSmoothStore[0]
            model['UKN']['UKN']['UKN'] = rstar/gooddeno
            self.modelDictionary[('UKN','UKN','UKN')] = rstar/gooddeno

            
            
    def query(self,key):
        keyList = list(key)
        if key not in self.modelDictionary:
            keyLen = len(keyList)
            keyList[keyLen-1] = 'UKN'
            if tuple(keyList) in self.modelDictionary:
                return self.modelDictionary[tuple(keyList)]
            else:
                keyList[keyLen-2] = 'UKN'
                return self.modelDictionary[tuple(keyList)]
        else:
            return self.modelDictionary[key]
    
    def keys(self):
        return self.modelDictionary.keys()
    
    def items(self):
        return self.modelDictionary.items()
    
    def values(self):
        return self.modelDictionary.values()

In [143]:
#Task 1
ug = LanguageModel(train,1)
bg = LanguageModel(train,2,smoothing='goodturing')
tg = LanguageModel(train,3,smoothing='goodturing')

In [144]:
# Task 1
# Zif's law verification
#for w in unigramLM:
# TODO
   

In [145]:
# Task 1
# Top 10 unigrams, bigrams, trigrams
top10Uni = dict(sorted(ug.items(), key=lambda x: x[1], reverse=True)[:10])
top10Bi = dict(sorted(bg.items(), key=lambda x: x[1], reverse=True)[:10])
top10Tri = dict(sorted(tg.items(), key=lambda x: x[1], reverse=True)[:10])
print(top10Uni)
print(top10Bi)
print(top10Tri)

{('the',): 0.07093622801935767, ('of',): 0.039303455703185775, ('and',): 0.02776098733658642, ('to',): 0.02555428353127902, ('in',): 0.02223794449813197, ('a',): 0.022168827921086463, ('is',): 0.01190564456234755, ('that',): 0.010354919906453855, ('for',): 0.00978565064460633, ('it',): 0.007604080140042752}
{('as', 'well'): 0.0009978977142877654, ('or', 'the'): 0.0009751638335066293, ('<s>', 'mr.'): 0.0009620021130543926, ('in', 'their'): 0.0007466285056541554, ('the', 'first'): 0.0006927851038040962, ('of', 'his'): 0.0006736407831462974, ('one', 'of'): 0.0006592825426529482, ('to', 'a'): 0.0006509069023651611, ('over', 'the'): 0.000619797381296238, ('some', 'of'): 0.0006078321808851137}
{('<s>', '<s>', 'the'): 5678, ("''", '</s>', '</s>'): 2181, ('<s>', '<s>', 'in'): 1560, ('<s>', '<s>', 'it'): 1477, ('<s>', '<s>', 'he'): 1354, ('<s>', '<s>', '``'): 1129, ('<s>', '<s>', 'this'): 1024, ('<s>', '<s>', 'but'): 1012, ('<s>', '<s>', 'a'): 917, ('<s>', '<s>', 'and'): 811}


In [146]:
# Task 1
# Test cases
fd = open('test_cases.txt','r')
testcases = [sent.split() for sent in fd]
print(testcases)    

[['he', 'lived', 'a', 'good', 'life'], ['the', 'man', 'was', 'happy'], ['the', 'person', 'was', 'good'], ['the', 'girl', 'was', 'sad'], ['he', 'won', 'the', 'war']]


In [147]:
#Task 1
#Log likelihood

#Unigram      
unigramLLH1 = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent,1):
        prob *= ug.query(word)
    if prob == 0:
        unigramLLH1.append(-float('Inf'))
    else:
        unigramLLH1.append(math.log(prob))
        
print(unigramLLH1)

[-32.83424838102342, -24.091989655768156, -23.4166871259785, -27.32603162480786, -24.734905768951318]


In [148]:
#Bigram
        
bigramLLH1 = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= bg.query(word)
    if prob == 0:
        bigramLLH1.append(-float('Inf'))
    else:
        bigramLLH1.append(math.log(prob)) 
print(bigramLLH1)

[-inf, -inf, -inf, -inf, -inf]


In [149]:
#Trigram
trigramLLH = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= tg.query(word)
    if prob == 0:
        trigramLLH.append(-float('Inf'))
    else:
        trigramLLH.append(math.log(prob))
        
print(trigramLLH)

[-84.30356866553593, -50.35493133657574, -51.45069055626144, -56.2128644910592, 13.656538272857798]


In [75]:
#Task 1
# Perplexity
#Unigram
unigramLLH = []
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 1):
        prob *= ug.query(word)
    if prob == 0:
        unigramLLH.append(float('Inf'))
    else:
        unigramLLH.append(prob**(1/float(len(sent))))
        
print(unigramLLH)

[0.0014062204912880043, 0.002422397773280329, 0.0028679098737995246, 0.0010792295131443134, 0.0020627268823580876]


In [83]:
#Bigram
bigramLLH = []

for sent in testcases:
    prob = 1
    for word in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= bg.query(word)
    if prob == 0:
        bigramLLH.append(float('Inf'))
    else:
        bigramLLH.append(prob**(1/float(len(sent))))
        
print(bigramLLH)

[0.004641888454787715, 0.003754133422712128, 0.0020101410421309316, inf, 0.005048924658664008]


In [150]:
#Trigram
trigramLLH = []

for sent in testcases:
    prob = 1
    for word in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= tg.query(word)
    if prob == 0:
        trigramLLH.append(float('Inf'))
    else:
        trigramLLH.append(prob**(1/float(len(sent))))
        
print(trigramLLH)

[4.7586642582364724e-08, 3.4102231322099295e-06, 2.5930581158550615e-06, 7.884347998791705e-07, 30.390637020150237]


In [151]:
# Interpolation
# Bigram model
bigramLLHInter = []
lambda_ = 0.2
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= lambda_*bg.query(word) + (1-lambda_)*ug.query(word[1])
    if prob == 0:
        bigramLLHInter.append(float('Inf'))
    else:
        bigramLLHInter.append(prob**(1/float(len(sent))))
        
print(bigramLLHInter)

KeyError: ('UKN', 'UKN')

In [152]:
trigramLLHInter = []
lambda1 = 0.2
lambda2 = 0.2
for sent in testcases:
    prob = 1
    for word in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        prob *= lambda2 *tg.query(word) + lambda1*bg.query(word) + (1-lambda1 - lambda2)*ug.query(word[1])
    if prob == 0:
        trigramLLHInter.append(float('Inf'))
    else:
        trigramLLHInter.append(prob**(1/float(len(sent))))
        
print(trigramLLH)

KeyError: ('<s>', 'UKN', 'UKN')