In [52]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import brown
from nltk.corpus import gutenberg
from sklearn.model_selection import train_test_split
import itertools
import string
import math

In [53]:
START = '<s>'
STOP = '</s>'
UNK = '<unk>'
l1 = 0.6
l2 = 0.2
l3 = 0.2
punc = list(string.punctuation)

In [54]:
def preprocess_Train_data(data):
    for line in data :
        line.insert(0,START)
        line.insert(0,START)
        line.append(STOP)
    words = list(itertools.chain.from_iterable(data))
    data_words = [x.lower() for x in words]
    data_words.append(UNK)
    freq = nltk.FreqDist(data_words)
    unknown = list()
    for word in freq.keys():
        if freq[word] == 1:
            unknown.append(word)
    unknown = set(unknown)
    vocab = list()
    for w in data_words :
        if w in unknown :
            vocab.append(UNK)
        else :
            vocab.append(w)
    data_words = vocab
    return data , words, data_words ,unknown

In [55]:
def create_bigram_table( data_words ):
    cfreq_2gram = nltk.ConditionalFreqDist(nltk.bigrams(data_words))
    cprob_2gram = nltk.ConditionalProbDist(cfreq_2gram, nltk.MLEProbDist)
    return cfreq_2gram,cprob_2gram

In [56]:
def create_trigram_table( data_words ):
    trigrams = nltk.trigrams(data_words)
    sen = list()
    sen = [((a,b),c) for (a,b,c) in trigrams]
    cfreq_3gram = nltk.ConditionalFreqDist(sen)
    cprob_3gram = nltk.ConditionalProbDist(cfreq_3gram, nltk.MLEProbDist)
    return cfreq_3gram,cprob_3gram    

In [102]:
def unigram_prob ( word, freq_1gram,len_):
    return freq_1gram[ word] / float(len_)

In [58]:
def katz ( w1 , w2 , w3 , cprob_3gram , cprob_2gram ,freq_1gram,len_):
        if cprob_3gram[(w1,w2)].prob(w3) > 0:
            #print 3 
            return cprob_3gram[(w1,w2)].prob(w3)
        else:
            if cprob_2gram[w2].prob(w3) > 0:
                #print 2
                return cprob_2gram[w2].prob(w3)
            else :
                #print 1
                return unigram_prob(w3,freq_1gram,len_)

In [59]:
def interpolate( w1 , w2 , w3 , cprob_3gram , cprob_2gram ,data_words):
    x = cprob_3gram[(w1,w2)].prob(w3)
    y = cprob_2gram[w2].prob(w3)
    z = unigram_prob(w3 ,data_words)
    return l1*x + l2*y + l3*z


In [60]:
def preprocess_test(Test,vocab_set):
    test = []
    for line in Test :
        t = []
        for word in line :
            if word not in vocab_set :
                t.append(UNK)
            else :
                t.append(word)
        #t = [i.lower() for i in line if i not in punc]
        t.append(STOP)
        t.insert(0,START)
        t.insert(0,START)
        test.append(t)
    return test

In [77]:
def evaluate_perplexity(test,cprob_3gram,cprob_2gram,freq_1gram,len_):
    perp = 0
    n = 0
    for line in test :
        for i in range(2,len(line)):
            val = katz(line[i-2],line[i-1],line[i],cprob_3gram,cprob_2gram,freq_1gram,len_)
            #val = interpolate(line[i-2],line[i-1],line[i],cprob_3gram,cprob_2gram,data_words)
            perp += math.log(val,2)
        n += len(line)
    perp = (-1) * (perp / float(n))
    return  2**perp

In [78]:
def create_model(Train , Test):
    Train, words, Train_words, unknown = preprocess_Train_data ( Train)
    print "step1"
    vocab_set = set( Train_words)
    print "step2"
    freq_1gram = nltk.FreqDist(Train_words)
    len_ = len(Train_words)
    print "step3"
    cfreq_2gram, cprob_2gram = create_bigram_table ( Train_words)
    print "step3"
    cfreq_3gram, cprob_3gram = create_trigram_table ( Train_words)
    print "step4"
    test = preprocess_test ( Test,vocab_set)
    print "step5"
    return cprob_3gram,cprob_2gram,freq_1gram,len_,test 


In [99]:
def try_dataset(dataset):
    if dataset == "brown":
        brown_sent = brown.sents()
        D1_Train,D1_Test = train_test_split(brown_sent,train_size = 0.8 , random_state = 7)    
        return create_model(D1_Train,D1_Test)

    else :
        if dataset == "gutenberg":
            gut_sent = gutenberg.sents()
            D2_Train,D2_Test = train_test_split(gut_sent,train_size = 0.8 , random_state = 7)
            return create_model(D2_Train,D2_Test)
        else :
            brown_sent = brown.sents()
            gut_sent = gutenberg.sents()
            sent = brown_sent + gut_sent
            D3_Train,D3_Test = train_test_split(sent,train_size = 0.8 , random_state = 7)
            return create_model(D3_Train,D3_Test)
    

In [100]:
print("Perplexity on Brown dataset")
bcprob_3gram,bcprob_2gram,bfreq_1gram,blen_,btest = try_dataset("brown")

Perplexity on Brown dataset
step1
step2
step3
step3
step4
step5


In [103]:
perplexity = evaluate_perplexity (btest ,bcprob_3gram, bcprob_2gram, bfreq_1gram,blen_)
print perplexity

79.191050418


In [104]:
print("Perplexity on Gutenberg dataset")
gcprob_3gram,gcprob_2gram,gfreq_1gram,glen_,gtest = try_dataset("gutenberg")

Perplexity on Gutenberg dataset
step1
step2
step3
step3
step4
step5


In [105]:
perplexity = evaluate_perplexity (gtest ,gcprob_3gram, gcprob_2gram, gfreq_1gram,glen_)
print perplexity

63.2924523313


In [106]:
print("Perplexity on Brown+Gutenberg dataset")
mcprob_3gram,mcprob_2gram,mfreq_1gram,mlen_,mtest = try_dataset("both")

Perplexity on Brown+Gutenberg dataset
step1
step2
step3
step3
step4
step5


In [107]:
perplexity = evaluate_perplexity (mtest ,mcprob_3gram, mcprob_2gram, mfreq_1gram,mlen_)
print perplexity

75.3793147537
