In [None]:
import nltk
from nltk.corpus import brown
from nltk import ngrams
import re

In [None]:
# Number of Brown-corpus sentences used to train the models.
NUM_SENTS = 40000
# Characters removed from every token: whitespace, common punctuation and digits.
# Written as a raw string so regex escapes such as \s and \- reach the `re`
# module untouched instead of being invalid Python string escapes.
pattern = re.compile(r"[`\s\n\r\t.,:;\-_'\"?!#&()]|[0-9]")

In [None]:
# Training corpus: the first NUM_SENTS sentences returned by brown.sents().
raw_data = brown.sents()[:NUM_SENTS]

In [None]:
def preprocess_data(data):
    """Normalise a collection of tokenised sentences.

    Each token is lower-cased and every character matched by the module-level
    ``pattern`` is removed.  Tokens that end up empty are discarded.

    Args:
        data: iterable of sentences, each an iterable of string tokens.

    Returns:
        A list with one list of cleaned tokens per input sentence (a sentence
        may come back empty if all of its tokens were stripped away).
    """
    cleaned_sents = []
    for sentence in data:
        # Deleting every match is the same as splitting on the pattern and
        # gluing the non-empty pieces back together.
        stripped = (pattern.sub('', token.lower()) for token in sentence)
        cleaned_sents.append([token for token in stripped if token != ''])
    return cleaned_sents

In [None]:
# Cleaned training sentences (lower-cased, punctuation/digits stripped) used by all models.
data = preprocess_data(raw_data)

In [None]:
class Unigram_Model:
    """Unigram language model based on raw token counts.

    Attributes:
        data: the training sentences (iterable of token lists).
        unigrams: dict mapping token -> count.  It also holds the pseudo-token
            '<s>', which is incremented once per training sentence.
        total: sum of all counts in ``unigrams`` minus one per training
            sentence, i.e. the '<s>' occurrences are not part of the total.
    """

    def __init__(self,data):
        self.data = data
        self.unigrams = {}
        self.total = 0

    def create_model(self):
        """Count every token of ``self.data`` and compute ``self.total``."""
        counts = self.unigrams
        counts['<s>'] = 0
        for sentence in self.data:
            counts['<s>'] += 1
            for token in sentence:
                counts[token] = counts.get(token, 0) + 1

        # Add up every count, then take out the one '<s>' added per sentence.
        self.total += sum(counts.values()) - len(self.data)

    def get_sent_prob(self,sent):
        """Return the unsmoothed probability of ``sent``.

        The result is the product of count/total over the tokens; it is 0 as
        soon as a token was never seen in training and 1 for an empty sentence.
        """
        prob = 1
        for token in sent:
            count = self.unigrams.get(token)
            if count is None:
                return 0
            prob *= count/self.total
        return prob

    def get_addtive_smooth_prob(self,sent,k):
        """Return the add-k smoothed probability of ``sent``.

        Each token contributes (count + k) / (total + k * V), where V is the
        number of entries in ``unigrams`` minus one (the '<s>' entry).
        """
        denominator = self.total + k*(len(self.unigrams)-1)
        prob = 1
        for token in sent:
            prob *= (self.unigrams.get(token, 0)+k)/denominator
        return prob
    

In [None]:
class Bigram_Model:
    """Bigram language model based on raw pair counts.

    Sentences are padded with one '<s>' at the start and one '</s>' at the end
    before pairs are extracted.

    Attributes:
        data: the training sentences (iterable of token lists).
        bigrams: dict mapping (previous, current) -> count.  It also holds the
            pseudo-pair ('<s>', '<s>'), incremented once per training sentence.
        total: sum of all pair counts minus one per training sentence.
        num_once: number of entries in ``bigrams`` whose count is exactly 1.
    """

    def __init__(self,data):
        self.data = data
        self.bigrams = {}
        self.total = 0
        self.num_once = 0

    @staticmethod
    def _pairs(sent):
        """Return the consecutive (previous, current) pairs of the padded sentence."""
        padded = ['<s>', *sent, '</s>']
        return list(zip(padded, padded[1:]))

    def create_model(self):
        """Count every bigram of ``self.data`` and derive ``total`` / ``num_once``."""
        counts = self.bigrams
        start_pair = ('<s>','<s>')
        counts[start_pair] = 0
        for sentence in self.data:
            counts[start_pair] += 1
            for pair in self._pairs(sentence):
                counts[pair] = counts.get(pair, 0) + 1

        for count in counts.values():
            self.total += count
            if count == 1:
                self.num_once += 1
        # Take out the one ('<s>', '<s>') pseudo-pair added per sentence.
        self.total -= len(self.data)

    def get_sent_prob(self,sent,unigram_model):
        """Return the unsmoothed probability of ``sent``.

        Every pair contributes count(prev, cur) / count(prev), with the
        denominator read from ``unigram_model.unigrams``.  Returns 0 as soon as
        an unseen pair is met.
        """
        prob = 1
        for pair in self._pairs(sent):
            if pair not in self.bigrams:
                return 0
            prob *= self.bigrams[pair]/unigram_model.unigrams[pair[0]]
        return prob

    def get_addtive_smooth_prob(self,sent,unigram_model,k):
        """Return the add-k smoothed probability of ``sent``.

        Every pair contributes (count(prev, cur) + k) / (count(prev) + k * B),
        where B is the number of entries in ``bigrams`` minus one.
        NOTE(review): textbook add-k smoothing scales k by the vocabulary size
        rather than by the number of distinct bigrams -- confirm the intent.
        """
        prob = 1
        for pair in self._pairs(sent):
            actual = self.bigrams.get(pair, 0)
            dem = unigram_model.unigrams.get(pair[0], 0)
            prob *= (actual+k)/(dem + k*(len(self.bigrams)-1))
        return prob

    def good_turing_prob(self,sent,unigram_model):
        """Return the probability of ``sent`` with a discount for unseen pairs.

        Seen pairs use the unsmoothed estimate.  An unseen pair receives
        ``num_once`` divided by the number of unseen pairs ((V)**2 minus the
        number of seen pairs, V = len(unigrams) - 1) and then divided by the
        number of seen pair types.
        NOTE(review): the last divisor is the number of distinct bigrams, not
        ``self.total`` -- confirm this is the intended formula.
        """
        prob = 1
        for pair in self._pairs(sent):
            if pair in self.bigrams:
                prob *= self.bigrams[pair]/unigram_model.unigrams[pair[0]]
                continue
            pnew = self.num_once/((len(unigram_model.unigrams)-1)**2 - (len(self.bigrams)-1))
            pnew /= (len(self.bigrams)-1)
            prob *= pnew
        return prob

    def interpolation_prob(self,sent,unigram_model,l):
        """Return l * P_bigram(sent) + (1 - l) * P_unigram(sent).

        Both terms are the unsmoothed whole-sentence probabilities, so the
        mixing is done once per sentence rather than once per word.
        """
        unigram_prob = unigram_model.get_sent_prob(sent)
        bigram_prob = self.get_sent_prob(sent,unigram_model)
        return l*bigram_prob + (1-l)*unigram_prob

In [None]:
class Trigram_Model:
    """Trigram language model based on raw triple counts.

    Sentences are padded with two '<s>' at the start and two '</s>' at the end
    before triples are extracted.

    Attributes:
        data: the training sentences (iterable of token lists).
        trigrams: dict mapping (w1, w2, w3) -> count.
        total: sum of all triple counts.
        num_once: number of entries in ``trigrams`` whose count is exactly 1.
    """

    def __init__(self,data):
        self.trigrams = {}
        self.total = 0
        self.data = data
        self.num_once = 0

    @staticmethod
    def _triples(sent):
        """Return the consecutive token triples of the padded sentence."""
        padded = ['<s>', '<s>', *sent, '</s>', '</s>']
        return list(zip(padded, padded[1:], padded[2:]))

    def create_model(self):
        """Count every trigram of ``self.data`` and derive ``total`` / ``num_once``."""
        counts = self.trigrams
        for sentence in self.data:
            # Walk the whole padded sentence so that the triples covering the
            # last words and the '</s>' markers are counted as well; the
            # scoring methods below look those triples up.
            for triple in self._triples(sentence):
                counts[triple] = counts.get(triple, 0) + 1

        for count in counts.values():
            self.total += count
            if count == 1:
                self.num_once += 1

    def get_sent_prob(self,sent,bigram_model):
        """Return the unsmoothed probability of ``sent``.

        Every triple contributes count(w1, w2, w3) / count(w1, w2), with the
        denominator read from ``bigram_model.bigrams``.  Returns 0 as soon as
        an unseen triple is met.
        """
        prob = 1
        for triple in self._triples(sent):
            if triple not in self.trigrams:
                return 0
            prob *= self.trigrams[triple]/bigram_model.bigrams[triple[:2]]
        return prob

    def good_turing_prob(self,sent,bigram_model):
        """Return the probability of ``sent`` with a discount for unseen triples.

        Seen triples use the unsmoothed estimate.  An unseen triple receives
        ``num_once`` divided by ((B - 1)**2 - (T - 1)) and then by (T - 1),
        where B = len(bigram_model.bigrams) and T = len(self.trigrams).
        NOTE(review): this mirrors the bigram formula term by term; confirm
        that the estimate of the number of unseen trigrams is the intended one.
        """
        prob = 1
        for triple in self._triples(sent):
            if triple in self.trigrams:
                prob *= self.trigrams[triple]/bigram_model.bigrams[triple[:2]]
                continue
            pnew = self.num_once/((len(bigram_model.bigrams)-1)**2 - (len(self.trigrams)-1))
            pnew /= (len(self.trigrams)-1)
            prob *= pnew
        return prob

    def get_addtive_smooth_prob(self,sent,bigram_model,k):
        """Return the add-k smoothed probability of ``sent``.

        Every triple contributes
        (count(w1, w2, w3) + k) / (count(w1, w2) + k * len(self.trigrams)).
        """
        prob = 1
        for triple in self._triples(sent):
            actual = self.trigrams.get(triple, 0)
            dem = bigram_model.bigrams.get(triple[:2], 0)
            prob *= (actual+k)/(dem + k*len(self.trigrams))
        return prob

In [None]:
# Build one model of each order from the same preprocessed training data.
# create_model() must be called before any of the probability methods,
# because the constructors only store the data and zeroed counters.
model_uni = Unigram_Model(data)
model_uni.create_model()

model_bi = Bigram_Model(data)
model_bi.create_model()

model_tri = Trigram_Model(data)
model_tri.create_model()

In [None]:
def get_sorted_ngrams(ngrams):
    """Return the (n-gram, count) items of ``ngrams`` ordered by descending count.

    Entries with equal counts keep the order they have in the dictionary.
    """
    def count_of(item):
        return item[1]

    ranked = list(ngrams.items())
    ranked.sort(key=count_of, reverse=True)
    return ranked

# (n-gram, count) lists for each model, most frequent first.
sorted_unigrams = get_sorted_ngrams(model_uni.unigrams)
sorted_bigrams = get_sorted_ngrams(model_bi.bigrams)
sorted_trigrams = get_sorted_ngrams(model_tri.trigrams)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Rank/frequency curve of the 100 most frequent unigram entries, saved as Unigram.png.
# The x range is fixed at 100 points, so at least 100 distinct entries are required.
plt.plot(range(1,100+1),[x[1] for x in sorted_unigrams[:100]])
plt.xlabel("Rank")
plt.ylabel("Unigram Frequency")
plt.savefig("Unigram.png")
plt.show()

In [None]:
# Rank/frequency curve of the 100 most frequent bigram entries, saved as Bigram.png.
# The x range is fixed at 100 points, so at least 100 distinct entries are required.
plt.plot(range(1,100+1),[x[1] for x in sorted_bigrams[:100]])
plt.xlabel("Rank")
plt.ylabel("Bigram Frequency")
plt.savefig("Bigram.png")
plt.show()

In [None]:
# Rank/frequency curve of the 100 most frequent trigram entries, saved as Trigram.png.
# The x range is fixed at 100 points, so at least 100 distinct entries are required.
plt.plot(range(1,100+1),[x[1] for x in sorted_trigrams[:100]])
plt.xlabel("Rank")
plt.ylabel("Trigram Frequency")
plt.savefig("Trigram.png")
plt.show()

In [None]:
# Eleven entries are shown: the unigram table also contains the '<s>'
# pseudo-token -- presumably the extra slot makes room for it (confirm).
print("Top 10 unigrams")
sorted_unigrams[:11]

In [None]:
# Eleven entries are shown: the bigram table also contains the ('<s>', '<s>')
# pseudo-pair -- presumably the extra slot makes room for it (confirm).
print("Top 10 bigrams")
sorted_bigrams[:11]

In [None]:
# The ten most frequent trigram entries (most frequent first).
print("Top 10 trigrams")
sorted_trigrams[:10]

In [None]:
# Read the evaluation sentences, one per line.  Blank lines are skipped: they
# would turn into empty sentences, and the perplexity computations divide by
# len(sent).
with open('test_examples.txt') as f:
    test_data = [line for line in f if line.strip()]

In [None]:
# Split each line on whitespace, then clean the tokens like the training data.
# NOTE: test_data is rebound from raw strings to token lists here, so this cell
# cannot be run a second time without re-running the cell that reads the file.
test_data = [x.split() for x in test_data]
test_data = preprocess_data(test_data)

In [None]:
def get_corpus_prob(data,model,model2=None):
    """Return ``model.get_sent_prob`` for every sentence in ``data``.

    When ``model2`` is given it is passed along as the lower-order model
    (second positional argument); otherwise only the sentence is passed.
    """
    extra_args = () if model2 is None else (model2,)
    return [model.get_sent_prob(sentence, *extra_args) for sentence in data]

In [None]:
# Unsmoothed sentence probabilities; each higher-order model is given the
# next lower-order model, whose counts it uses as denominators.
probs_uni = get_corpus_prob(test_data,model_uni)
probs_bi = get_corpus_prob(test_data,model_bi,model_uni)
probs_tri = get_corpus_prob(test_data,model_tri,model_bi)

In [None]:
# Print the unsmoothed probability and perplexity of every test sentence.
# 1e-200 is added before exponentiation because a probability of 0 (unseen
# word or n-gram) raised to a negative power raises ZeroDivisionError.
for i,sent in enumerate(test_data):
    print("Sentence : ",' '.join(sent))

    print("Unigram Prob : ", probs_uni[i]," Perplexity : ",(probs_uni[i]+1e-200)**(-1/(len(sent))))
    print("Bigram Prob : ", probs_bi[i]," Perplexity : ",(probs_bi[i]+1e-200)**(-1/(len(sent))))
    print("Trigram Prob : ", probs_tri[i]," Perplexity : ",(probs_tri[i]+1e-200)**(-1/(len(sent))))
    print("---------------------------------------------------------")
    

In [None]:
# Additive-smoothing constants (k) to compare.
k_values = [0.0001,0.001,0.01,0.1,1.0]

In [None]:
def get_corpus_additive_smooth_prob(data,model,model2=None):
    """Return add-k smoothed probabilities for every k in the global ``k_values``.

    The result maps each k to a list with one probability per sentence of
    ``data``.  When ``model2`` is given it is passed to the model as the
    lower-order model.
    """
    def score(sentence, k):
        if model2 is None:
            return model.get_addtive_smooth_prob(sentence, k)
        return model.get_addtive_smooth_prob(sentence, model2, k)

    return {k: [score(sentence, k) for sentence in data] for k in k_values}

In [None]:
# Add-k smoothed sentence probabilities, keyed by k, for each model order.
smooth_uni = get_corpus_additive_smooth_prob(test_data,model_uni)
smooth_bi = get_corpus_additive_smooth_prob(test_data,model_bi,model_uni)
smooth_tri = get_corpus_additive_smooth_prob(test_data,model_tri,model_bi)

In [None]:
# Print add-k smoothed probabilities and perplexities for every k and sentence.
print("Additive Smoothing")
for k in k_values:
    print("###############################################################")
    print("K = ",k)
    for i,sent in enumerate(test_data):
        print("Sentence : ",' '.join(sent))
        print("Unigram Prob : ", smooth_uni[k][i],"Perplexity : ",smooth_uni[k][i]**(-1/(len(sent))))
        print("Bigram Prob : ", smooth_bi[k][i],"Perplexity : ",(smooth_bi[k][i]+1e-200)**(-1/(len(sent))))
        # The trigram perplexity must be derived from the trigram probability.
        print("Trigram Prob : ", smooth_tri[k][i],"Perplexity : ",(smooth_tri[k][i]+1e-200)**(-1/(len(sent))))
        print("---------------------------------------------------------")

In [None]:
def get_good_turing_prob(data,model,model2=None):
    """Return ``model.good_turing_prob`` for every sentence in ``data``.

    When ``model2`` is given it is passed along as the lower-order model
    (second positional argument); otherwise only the sentence is passed.
    """
    extra_args = () if model2 is None else (model2,)
    return [model.good_turing_prob(sentence, *extra_args) for sentence in data]

In [None]:
# Good-Turing style probabilities for the bigram and trigram models.
good_bi = get_good_turing_prob(test_data,model_bi,model_uni)
good_tri = get_good_turing_prob(test_data,model_tri,model_bi)

In [None]:
# Print the Good-Turing probability and perplexity of every test sentence.
# 1e-200 is added before exponentiation so that a probability of 0 does not
# raise ZeroDivisionError.
print("Good Turing Smoothing")
for i,sent in enumerate(test_data):
    print("Sentence : ",' '.join(sent))
    print("Bigram Prob : ", good_bi[i]," Perplexity : ",(good_bi[i]+1e-200)**(-1/(len(sent))))
    print("Trigram Prob : ", good_tri[i]," Perplexity : ",(good_tri[i]+1e-200)**(-1/(len(sent))))
    print("---------------------------------------------------------")

In [None]:
# Interpolation weights (lambda) given to the bigram probability.
l_values = [0.2,0.5,0.8]

In [None]:
# Interpolated bigram/unigram sentence probabilities, keyed by lambda.
bigram_interpol_probs = {}
for l in l_values:
    bigram_interpol_probs[l] = [model_bi.interpolation_prob(x,model_uni,l) for x in test_data]

In [None]:
# Print the interpolated probability and perplexity for every lambda and sentence.
# 1e-200 is added before exponentiation because the interpolated probability
# is 0 when both the unigram and the bigram probability are 0, and 0 raised to
# a negative power raises ZeroDivisionError.
print("Interpolation")
for l in l_values:
    print("###############################################################")
    print("lambda = ",l)
    for i,sent in enumerate(test_data):
        print("Sentence : ",' '.join(sent))
        print("Bigram Prob : ", bigram_interpol_probs[l][i],"Perplexity : ",(bigram_interpol_probs[l][i]+1e-200)**(-1/(len(sent))))
        print("---------------------------------------------------------")