In [None]:
import nltk 
from nltk.corpus import brown
from nltk import bigrams, ngrams, trigrams 
from string import punctuation
import numpy as np
import math


In [None]:
# Pull every sentence of the Brown corpus; each sentence is a sequence of tokens.
dataset = brown.sents()
print("Dataset loaded with %d no of sentences." % (len(dataset),))

In [None]:
# Lower-case every token and drop the ones made up solely of punctuation
# characters. A plain `word not in punctuation` is a substring test against the
# punctuation string, so multi-character tokens such as "``", "''" or "--"
# would slip through; checking character by character removes them as well.
punctuationChars = set(punctuation)
dataProcessed = [
    [word.lower() for word in sentence if not all(ch in punctuationChars for ch in word)]
    for sentence in dataset
]
print("Data preprocessed with special characters removed.")

In [None]:
# The first TRAIN_SIZE sentences train the models; the remainder is held out.
TRAIN_SIZE = 40000
train = dataProcessed[:TRAIN_SIZE]
test = dataProcessed[TRAIN_SIZE:]
# Report the real size of the slice (it is smaller than TRAIN_SIZE when the
# corpus has fewer sentences than that).
print("Train data : ",len(train))


In [None]:
class LanguageModel():
    """N-gram language model (n = 1, 2 or 3) with optional add-k smoothing.

    Probabilities live in ``modelDictionary``, keyed by the n-gram tuples
    yielded by ``ngrams``. Bigram and trigram sentences are padded with
    ``<s>`` / ``</s>``; unigram sentences are not padded.

    ``smoothing='laplace'`` applies add-k smoothing with ``laplace_constant``
    as k; any other value produces unsmoothed relative frequencies.
    """
    
    def __init__(self,sentences,n_gram,smoothing='none',laplace_constant=0.5):
        """Build the model from ``sentences``.

        sentences        -- iterable of token sequences
        n_gram           -- 1, 2 or 3; any other value leaves the model empty
        smoothing        -- 'laplace' for add-k smoothing, anything else for none
        laplace_constant -- the k added to every count under 'laplace'
        """
        self.smoothing = smoothing
        self.modelDictionary={}
        self.laplace_constant=laplace_constant
        # Raw count of each context (an n-gram without its last token); query()
        # needs it to smooth n-grams that never occurred in training.
        self.contextCounts = {}
        if n_gram == 1:
            self.unigramModel(sentences)
        elif n_gram == 2:
            self.bigramModel(sentences)
        elif n_gram == 3:
            self.trigramModel(sentences)

    def _fit(self,sentences,n,padded):
        """Count the n-grams of order ``n`` and store their probabilities."""
        counts = {}
        contexts = {}
        for sent in sentences:
            if padded:
                grams = ngrams(sent, n, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>')
            else:
                grams = ngrams(sent, n)
            for gram in grams:
                counts[gram] = counts.get(gram, 0) + 1
                contexts[gram[:-1]] = contexts.get(gram[:-1], 0) + 1

        # NOTE(review): the smoothing mass is spread over the number of distinct
        # n-grams seen, which is the convention used by the other models in this
        # file; textbook add-k uses the vocabulary size instead (the two only
        # coincide for unigrams) -- confirm which is intended.
        distinct = len(counts)
        k = self.laplace_constant
        probabilities = {}
        for gram, seen in counts.items():
            total = float(contexts[gram[:-1]])
            if self.smoothing == 'laplace':
                probabilities[gram] = (seen + k) / (k * distinct + total)
            else:
                probabilities[gram] = seen / total
        self.modelDictionary = probabilities
        self.contextCounts = contexts

    def bigramModel(self,sentences):
        """Fit P(w2 | w1) on sentences padded with <s> and </s>."""
        self._fit(sentences, 2, True)

    def unigramModel(self,sentences):
        """Fit P(w) on the unpadded sentences."""
        self._fit(sentences, 1, False)

    def trigramModel(self,sentences):
        """Fit P(w3 | w1, w2) on sentences padded with <s> and </s>."""
        self._fit(sentences, 3, True)

    def query(self,key):
        """Return the probability stored for the n-gram tuple ``key``.

        For an n-gram that was not seen in training the result is 0 without
        smoothing. Under 'laplace' it is k / (k * distinct + context count),
        i.e. the same formula as for seen n-grams with a count of zero; when
        the context itself is unseen this reduces to 1 / distinct.
        """
        if key in self.modelDictionary:
            return self.modelDictionary[key]
        if self.smoothing != 'laplace':
            return 0
        k = self.laplace_constant
        context = tuple(key[:-1]) if isinstance(key, tuple) else None
        denominator = k * len(self.modelDictionary) + self.contextCounts.get(context, 0)
        if denominator == 0:
            # Empty model (or k == 0 with an unseen context): nothing to share out.
            return 0
        return k / float(denominator)
    
    def keys(self):
        """Return the n-gram tuples known to the model."""
        return self.modelDictionary.keys()
    
    def values(self):
        """Return the stored probabilities, in the same order as keys()."""
        return self.modelDictionary.values()

In [None]:
def unigramModel(sentences):
    """Estimate unsmoothed unigram probabilities.

    Returns ``(count, model)``: the total number of unigrams read and a dict
    mapping every unigram yielded by ``ngrams(sent, 1)`` to its share of that total.
    """
    tallies = {}
    total = 0
    for sentence in sentences:
        for gram in ngrams(sentence, 1):
            tallies[gram] = tallies.get(gram, 0) + 1
            total += 1

    # Turn the raw tallies into relative frequencies.
    model = {gram: seen / total for gram, seen in tallies.items()}
    return total, model


In [None]:
def bigramModel(sentences,smoothing='none',laplace_constant=0.5):
    """Estimate bigram probabilities P(w2 | w1) from ``sentences``.

    Each sentence is padded with '<s>' on the left and '</s>' on the right.

    smoothing        -- 'none' for relative frequencies, 'laplace' for add-k
                        smoothing; any other value leaves the raw counts in place
    laplace_constant -- the k added to every count under 'laplace'

    Returns ``(count, model, modelDictionary)``: the number of bigrams read, a
    nested dict ``model[w1][w2]`` and a flat dict keyed by ``(w1, w2)``. Both
    dicts hold the same value for the same bigram.
    """
    model = {}
    modelDictionary = {}
    count = 0
    for sent in sentences:
        for w1,w2 in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
            count += 1
            followers = model.setdefault(w1, {})
            followers[w2] = followers.get(w2, 0) + 1
            modelDictionary[(w1,w2)] = followers[w2]

    # Number of distinct bigrams; used as the size of the smoothing mass.
    un = len(modelDictionary)
    for w1, followers in model.items():
        # Taken before any value is overwritten, so it is the raw count of w1 as a context.
        subcount = float(sum(followers.values()))
        for w2 in followers:
            if smoothing == 'none':
                # Normalise by the context count, not the global bigram count,
                # so the nested and the flat table agree.
                probability = followers[w2] / subcount
            elif smoothing == 'laplace':
                probability = (followers[w2] + laplace_constant) / (laplace_constant*un + subcount)
            else:
                continue
            followers[w2] = probability
            modelDictionary[(w1,w2)] = probability
    return count, model, modelDictionary



In [None]:
def trigramModel(sentences,smoothing='none',laplace_constant=0.5):
    """Estimate trigram probabilities P(w3 | w1, w2) from ``sentences``.

    Sentences are padded with '<s>' on the left and '</s>' on the right.
    With smoothing 'none' the counts become relative frequencies, with
    'laplace' they get add-k smoothing (k = ``laplace_constant``); any other
    value leaves the raw counts untouched.

    Returns ``(count, model, modelDictionary)``: the number of trigrams read,
    the nested table ``model[w1][w2][w3]`` and the flat table keyed by
    ``(w1, w2, w3)``.
    """
    nested = {}
    flat = {}
    total = 0
    for sentence in sentences:
        padded = ngrams(sentence, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>')
        for first, second, third in padded:
            total += 1
            leaf = nested.setdefault(first, {}).setdefault(second, {})
            leaf[third] = leaf.get(third, 0) + 1
            flat[(first, second, third)] = leaf[third]

    distinct = len(flat)
    for first, by_second in nested.items():
        for second, leaf in by_second.items():
            # Raw count of the (first, second) context, taken before rescaling.
            context_total = float(sum(leaf.values()))
            for third in leaf:
                if smoothing == 'none':
                    value = leaf[third] / context_total
                elif smoothing == 'laplace':
                    value = (leaf[third] + laplace_constant) / (laplace_constant*distinct + context_total)
                else:
                    continue
                leaf[third] = value
                flat[(first, second, third)] = value

    return total, nested, flat


In [None]:

# NOTE(review): unigramSmoothing is not passed on -- unigramModel takes no
# smoothing argument, so the unigram table is always unsmoothed.
unigramSmoothing='laplace'
count, unigramLM = unigramModel(train) 
print(len(unigramLM))

bigramSmoothing='laplace'
count, bigramLM, bigramDictLM = bigramModel(train,smoothing=bigramSmoothing) 

print(len(bigramLM))

trigramSmoothing='laplace'
count, trigramLM, trigramDictLM = trigramModel(train,smoothing=trigramSmoothing) 

print(len(trigramLM))
# Show a handful of entries only; printing the whole table emits one entry per
# distinct unigram and buries the rest of the notebook output.
print(list(unigramLM.items())[:10])

In [None]:
ug = LanguageModel(train,1)
# Spot-check a few entries, labelled with their key, instead of printing one
# bare number for every unigram in the model.
for key in list(ug.keys())[:10]:
    print(key, ug.query(key))

In [None]:
# Zipf's law verification
#for w in unigramLM:
# TODO
   

In [None]:
# Top 10 unigrams, bigrams, trigrams
def _tenMostProbable(table):
    """Return the ten highest-valued entries of ``table`` as a dict, best first."""
    ranked = sorted(table.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked[:10])

top10Uni = _tenMostProbable(unigramLM)
top10Bi = _tenMostProbable(bigramDictLM)
top10Tri = _tenMostProbable(trigramDictLM)
for leaders in (top10Uni, top10Bi, top10Tri):
    print(leaders)

In [None]:
# Test cases
# One whitespace-tokenised sentence per line. The context manager closes the
# file even if reading fails; blank lines are skipped so that no empty sentence
# reaches the per-sentence normalisation in the perplexity cells.
# NOTE(review): the training sentences are lower-cased and stripped of
# punctuation tokens, while these are used as read -- confirm test_cases.txt is
# already in that form.
with open('test_cases.txt','r') as fd:
    testcases = [line.split() for line in fd if line.strip()]
print(testcases)    

In [None]:
#Log likelihood

#Unigram
# Add up log-probabilities instead of taking the log of a product: the product
# of many small probabilities underflows to 0.0 on long sentences, which would
# be reported as -inf even though every word is known.
unigramLLH = []
for sent in testcases:
    logProb = 0.0
    for word in ngrams(sent,1):
        wordProb = unigramLM.get(word, 0)
        if wordProb <= 0:
            # An unseen unigram has probability 0 here, so the sentence does too.
            logProb = -float('Inf')
            break
        logProb += math.log(wordProb)
    unigramLLH.append(logProb)

print(unigramLLH)

In [None]:
#Bigram
# Log-probabilities are summed per sentence; multiplying the probabilities
# first underflows to 0.0 on long sentences and yields a spurious -inf.
# Bigrams missing from the table get 1/len(bigramDictLM).
unseenProb = 1.0/len(bigramDictLM) if bigramDictLM else 0.0
bigramLLH = []
for sent in testcases:
    logProb = 0.0
    for word in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        gramProb = bigramDictLM.get(word, unseenProb)
        logProb += math.log(gramProb) if gramProb > 0 else -float('Inf')
    bigramLLH.append(logProb)

print(bigramLLH)

In [None]:
#Trigram
# Log-probabilities are summed per sentence; multiplying the probabilities
# first underflows to 0.0 on long sentences and yields a spurious -inf.
# Trigrams missing from the table get 1/len(trigramDictLM).
unseenProb = 1.0/len(trigramDictLM) if trigramDictLM else 0.0
trigramLLH = []
for sent in testcases:
    logProb = 0.0
    for word in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        gramProb = trigramDictLM.get(word, unseenProb)
        logProb += math.log(gramProb) if gramProb > 0 else -float('Inf')
    trigramLLH.append(logProb)

print(trigramLLH)

In [None]:
# Perplexity
#Unigram
# Perplexity = exp(-(sum of log-probabilities) / len(sent)). Working in log
# space avoids the overflow of multiplying many inverse probabilities. The
# results get their own list so the log-likelihoods in unigramLLH are kept.
# Unigrams missing from the table get 1/len(unigramLM).
unseenProb = 1.0/len(unigramLM) if unigramLM else 0.0
unigramPerplexity = []
for sent in testcases:
    if not sent:
        # Nothing to normalise by; mark the sentence as undefined.
        unigramPerplexity.append(float('nan'))
        continue
    logProb = 0.0
    for word in ngrams(sent, 1):
        wordProb = unigramLM.get(word, unseenProb)
        logProb += math.log(wordProb) if wordProb > 0 else -float('Inf')
    if logProb == -float('Inf'):
        # A zero-probability sentence has unbounded perplexity.
        unigramPerplexity.append(float('Inf'))
    else:
        unigramPerplexity.append(math.exp(-logProb/len(sent)))

print(unigramPerplexity)

In [None]:
#Bigram
# Perplexity = exp(-(sum of log-probabilities) / len(sent)), computed in log
# space so that long sentences do not overflow. The results get their own list
# so the log-likelihoods in bigramLLH are kept.
# NOTE(review): the sum runs over the padded bigrams but is normalised by
# len(sent), as before -- confirm that this is the intended normaliser.
unseenProb = 1.0/len(bigramDictLM) if bigramDictLM else 0.0
bigramPerplexity = []
for sent in testcases:
    if not sent:
        # Nothing to normalise by; mark the sentence as undefined.
        bigramPerplexity.append(float('nan'))
        continue
    logProb = 0.0
    for word in ngrams(sent, 2, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        gramProb = bigramDictLM.get(word, unseenProb)
        logProb += math.log(gramProb) if gramProb > 0 else -float('Inf')
    if logProb == -float('Inf'):
        # A zero-probability sentence has unbounded perplexity.
        bigramPerplexity.append(float('Inf'))
    else:
        bigramPerplexity.append(math.exp(-logProb/len(sent)))

print(bigramPerplexity)

In [None]:
#Trigram
# Perplexity = exp(-(sum of log-probabilities) / len(sent)), computed in log
# space so that long sentences do not overflow. The results get their own list
# so the log-likelihoods in trigramLLH are kept.
# NOTE(review): the sum runs over the padded trigrams but is normalised by
# len(sent), as before -- confirm that this is the intended normaliser.
unseenProb = 1.0/len(trigramDictLM) if trigramDictLM else 0.0
trigramPerplexity = []
for sent in testcases:
    if not sent:
        # Nothing to normalise by; mark the sentence as undefined.
        trigramPerplexity.append(float('nan'))
        continue
    logProb = 0.0
    for word in ngrams(sent, 3, pad_left = True, pad_right = True, left_pad_symbol='<s>',right_pad_symbol='</s>'):
        gramProb = trigramDictLM.get(word, unseenProb)
        logProb += math.log(gramProb) if gramProb > 0 else -float('Inf')
    if logProb == -float('Inf'):
        # A zero-probability sentence has unbounded perplexity.
        trigramPerplexity.append(float('Inf'))
    else:
        trigramPerplexity.append(math.exp(-logProb/len(sent)))

print(trigramPerplexity)