In [1]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pandas as pd
import nltk
import string
import numpy as np
import copy

In [2]:
data=pd.read_csv('train_dataset.csv')

In [3]:
comments='Comment'

In [4]:
def tokenize_sent(data):
    sentences=sent_tokenize(data)
    return sentences

def tokenize_word(data):
    words=word_tokenize(data)
    return words

In [5]:
data[comments]=data[comments].apply(tokenize_sent)

In [6]:
unigram_data=data[comments]
sentences=[]
translator = str.maketrans('', '', string.punctuation)
for data in unigram_data:
    for sent in data:
        new_sent=sent.lower()
        sentences.append(new_sent.translate(translator))

In [7]:
unigram_sent=sentences
bigram_sent=[]
trigram_sent=[]
quadgram_sent=[]

In [8]:
for sent in sentences:
    bigram_sent.append(sent)
    trigram_sent.append(sent)
    quadgram_sent.append(sent)

In [9]:
unigram_words=[]
for sent in unigram_sent:
    words=word_tokenize(sent)
    unigram_words.append(words)


In [10]:
bigram_words=unigram_words.copy()
trigram_words=unigram_words.copy()
quadgram_words = unigram_words.copy()

for sent in bigram_words:
    sent.append('</s>')
    sent.insert(0,'<s>')

In [11]:
for sent in trigram_words:
    sent.append('</s>')
    sent.append('</s>')
    sent.insert(0,'<s>')
    sent.insert(0,'<s>')

for sent in quadgram_words:
    sent.append('</s>')
    sent.append('</s>')
    sent.append('</s>')
    sent.insert(0,'<s>')
    sent.insert(0,'<s>')
    sent.insert(0,'<s>')

In [104]:
import numpy as np
start_sent='<s>'
end_sent='</s>'

class unigram_model():
  def __init__(self,sentences):
    self.unigram_frequencies=dict()
    self.vocabulary=set()
    self.corpus_size=0
    for sentence in sentences:
      for word in sentence:
        self.unigram_frequencies[word]=self.unigram_frequencies.get(word,0)+1
        if word!=start_sent or  word!=end_sent:  
         self.corpus_size+=1
        if word not in self.vocabulary:
         self.vocabulary.add(word)
    self.vocab_size=len(self.unigram_frequencies)-2 #Not including start and end of sentence in vocabulary

  def calculate_probability(self,word):
    if word not in self.vocabulary:
      return 0
    return float(self.unigram_frequencies[word])/float(self.corpus_size)

  def calculate_perplexity(self,word):
    prob_word=self.calculate_probability(word)
    if(prob_word==0):
      return 10**(7)
    perplexity=np.log2(float(1/prob_word))*(float(1/self.corpus_size))
    return perplexity


class bigram_model(unigram_model):
  def __init__(self,sentences):
    unigram_model.__init__(self,sentences)
    self.bigram_frequencies=dict()
    self.total_bigrams=0
    for sentence in sentences:
      prev_word=sentence[0]
      for word in sentence[1:]:
        self.bigram_frequencies[(prev_word,word)]=self.bigram_frequencies.get((prev_word,word),0)+1
        self.total_bigrams+=1
        prev_word=word

    self.total_bigram_words=len(self.bigram_frequencies)

  def calculate_probability(self,prev_word,word):
    a=self.bigram_frequencies.get((prev_word,word),0)
    b=self.unigram_frequencies.get(prev_word,0)

    if b==0:
      return 0
    return (float(a)/float(b))*unigram_model.calculate_probability(self,prev_word)

  def calculate_perplexity(self,prev_word,word):
    prob_word=self.calculate_probability(prev_word,word)
    if(prob_word==0):
      return 10**(7)
    perplexity=np.log2(float((1/prob_word)))*(float(1/self.total_bigrams))
    return perplexity

class trigram_model(bigram_model):
    def __init__(self, sentences):
        bigram_model.__init__(self, sentences)
        self.trigram_frequencies = {}
        self.total_trigrams=0
        for sentence in sentences:
            prev_word1 = sentence[0]
            prev_word2 = sentence[1]
            for word in sentence[2:]:
              self.trigram_frequencies[(prev_word1, prev_word2, word)] = self.trigram_frequencies.get((prev_word1, prev_word2, word), 0) + 1
              prev_word1 = prev_word2
              prev_word2 = word
              self.total_trigrams+=1

        self.total_trigram_words = len(self.trigram_frequencies)

    def calculate_probability(self, prev_word1, prev_word2, word):
        trigram_frequency = self.trigram_frequencies.get((prev_word1, prev_word2, word), 0)
        bigram_frequency = self.bigram_frequencies.get((prev_word2, word), 0)

        if bigram_frequency == 0:
            return 0
        return (float(trigram_frequency) / float(bigram_frequency))*bigram_model.calculate_probability(self,prev_word1,prev_word2)

    def calculate_perplexity(self,prev_word1, prev_word2, word):
      prob_word=self.calculate_probability(prev_word1, prev_word2, word)
      if(prob_word==0):
        return 10**(7) #float('inf')
      perplexity=np.log2(float((1/prob_word)))*(float(1/self.total_trigrams))
      return perplexity


class quadgram_model(trigram_model):
    def __init__(self, sentences):
        trigram_model.__init__(self, sentences)
        self.quadgram_frequencies = {}
        self.total_quadgrams=0
        for sentence in sentences:
            prev_word1 = sentence[0]
            prev_word2 = sentence[1]
            prev_word3 = sentence[2]
            for word in sentence[3:]:
                quadgram = (prev_word1,prev_word2,prev_word3,word)
                self.quadgram_frequencies[quadgram] = self.quadgram_frequencies.get(quadgram, 0) + 1
                prev_word1 = prev_word2
                prev_word2 = prev_word3
                prev_word3 = word
                self.total_quadgrams+=1

        self.total_quadgram_words = len(self.quadgram_frequencies)


    def calculate_probability(self, prev_word1, prev_word2, prev_word3, word):
        quadgram_frequency = self.quadgram_frequencies.get((prev_word1, prev_word2, prev_word3, word), 0)
        trigram_frequency = self.trigram_frequencies.get((prev_word1, prev_word2, prev_word3), 0)

        if trigram_frequency == 0:
            return 0
        return (float(quadgram_frequency) / float(trigram_frequency))*trigram_model.calculate_probability(self,prev_word1,prev_word2,prev_word3)
    
    def calculate_perplexity(self,prev_word1, prev_word2, prev_word3, word):
      prob_word=self.calculate_probability(prev_word1, prev_word2, prev_word3, word)
      if(prob_word==0):
        return 10**(7)
      perplexity=np.log2(float((1/prob_word)))*(float(1/self.total_quadgrams))
      return perplexity

In [105]:
test_data=pd.read_csv('test_dataset.csv')
test_comments='Comment'

test_data[test_comments]=test_data[test_comments].apply(tokenize_sent)
test_unigram_data=test_data[test_comments]
test_sentences=[]
for data in test_unigram_data:
    for sent in data:
        new_sent=sent.lower()
        test_sentences.append(new_sent.translate(translator))

test_unigram_sent=test_sentences
test_unigram_words=[]
for sent in test_unigram_sent:
    words=word_tokenize(sent)
    test_unigram_words.append(words)

In [106]:
UNIGRAM_MODEL=unigram_model(unigram_words)

unigram_perplex=0
count=0
for data in test_unigram_words:
    for word in data:
        count+=1
        unigram_perplex+=UNIGRAM_MODEL.calculate_perplexity(word)
avg_perplex=unigram_perplex/count

In [107]:
avg_perplex

145340.5528644213

In [108]:
BIGRAM_MODEL=bigram_model(bigram_words)
bigram_perplex=0
count=0
for data in test_unigram_words:
    for i in range(len(data)-1):
        count+=1
        bigram_perplex+=BIGRAM_MODEL.calculate_perplexity(data[i],data[i+1])
Avg_perplex=bigram_perplex/count
Avg_perplex

1758750.3513784565

In [109]:
class N_Gram(unigram_model):
    def __init__(self,sentences,n):
        if(n==1):
            self.model=unigram_model(sentences)
        if(n==2):
            self.model=bigram_model(sentences)
        if(n==3):
            self.model=trigram_model(sentences)
        if(n==4):
            self.model=quadgram_model(sentences)

In [110]:
test_uni_gram_model=N_Gram(unigram_words,1)
test_bi_gram_model=N_Gram(bigram_words,2)
test_tri_gram_model=N_Gram(trigram_words,3)
test_quad_gram_model=N_Gram(quadgram_words,4)

In [111]:
print(test_uni_gram_model.model.calculate_probability('the'))
print(test_bi_gram_model.model.calculate_probability('is','the'))
print(test_tri_gram_model.model.calculate_probability('seems','to','me'))
print(test_quad_gram_model.model.calculate_probability('seems','to','me','that'))

0.021763939748329936
0.0005819327752296191
1.8842512452678341e-06
8.696544208928466e-07
