# Language models using Ngrams


### *Brungesh BE*

Here we will explore language models.
To do so, we will encapsulate a language model using a class called `language_model` which implements language models with Laplace add-one smoothing.

In [6]:
import numpy as np
from collections import Counter, defaultdict

class language_model:

    def __init__(self, ngram=1):

        """
        Initialize a language model

        Parameters:
        ngram specifies the type of model:  
        unigram (ngram = 1), bigram (ngram = 2) etc.
        """
        self.ngram = ngram

    def unigram(self, text):
        
        """Calculate the unigram counts for the file 
        and store them in a dictionary"""
        
        data = {}
        words = [word for sentence in text for word in sentence.split()]
        word_length = len(set(words))
        self.c = Counter(words)

        for word in words:
            data[word] = (self.c[word] / word_length)
        return data, word_length

    def text_clean(self, text):
        
        """ Clean the text for special characters and add begining and 
        ending sentence tokens to each sentence"""
        
        text = text.strip('\n').lower()
        text = text.replace('\n', ' ')

        text = text.translate ({ord(c): "." for c in "!:?"})
        text = text.translate ({ord(c): "" for c in "\"\''""@#$%^&*()[]{};,/<>\|`~=_+"})
        text = text.translate ({ord(c): " " for c in "-"})

        sentences = ['<s> ' +" ".join(sentence.split())+ ' </s>' for sentence in text.split('.') if sentence != ""]
        return sentences

    def ngram_generation(self, sentence, n):
        
        """Generate ngrams for each sentence"""
        
        tokens = [token for token in sentence.split(" ") if token != ""]
        ngrams = zip(*[tokens[i:] for i in range(n)])
        ngrams = [" ".join(ngram) for ngram in ngrams]
        ngrams = [tuple(sent.split()) for sent in ngrams]
        return ngrams

    def bigram(self, text):
        
        """Calculate the bigram counts for the file 
        and store them in a model"""
        
        model = defaultdict(lambda: defaultdict(lambda: 0))
        for sentence in text:
            for w1, w2 in self.ngram_generation(sentence, 2):
                model[(w1)][w2] += 1
        for w1 in model:
            total_count = self.c[w1]
            for w2 in model[w1]:
                model[w1][w2] /= total_count
        return model

    def trigram(self, text):
        
        """Calculate the trigram counts for the file 
        and store them in a model"""
        
        model = defaultdict(lambda: defaultdict(lambda: 0))
        for sentence in text:
            for w1, w2, w3 in self.ngram_generation(sentence, 3):
                model[(w1, w2)][w3] += 1
        for w1_w2 in model:
            total_count = self.bi_data[w1_w2]
            for w3 in model[w1_w2]:
                model[w1_w2][w3] /= total_count
        return model
    
    def perplexity(self, data):
        
        """Calculate perplexity for the corpus"""
        
        exp = [np.log2(v) for k,v in data.items()]
        exp = sum(exp) / len(exp)
        return np.power(2, -exp)


    def train(self, file_name) :
        """
        train a language model

        Parameters:
        file_name is a file that contains the training set for the model
        """
        with open(file_name, 'r') as f:
            text = f.read()
        clean_text = self.text_clean(text)

        if self.ngram == 1:
            self.uni_data, self.uni_count = self.unigram(clean_text)
        if self.ngram == 2:
            self.uni_data, self.uni_count = self.unigram(clean_text)
            bi_model = dict(self.bigram(clean_text))
            self.bi_data = {(i,j): bi_model[i][j] 
                                       for i in bi_model.keys() 
                                       for j in bi_model[i].keys()}
            self.bi_data_count = len(self.bi_data)

        if self.ngram == 3:
            self.uni_data, self.uni_count = self.unigram(clean_text)
            bi_model = dict(self.bigram(clean_text))
            self.bi_data = {(i,j): bi_model[i][j] 
                                       for i in bi_model.keys() 
                                       for j in bi_model[i].keys()}
            self.bi_data_count = len(self.bi_data)
            tri_model = dict(self.trigram(clean_text))
            self.tri_data = {(i,j): tri_model[i][j] 
                                       for i in tri_model.keys() 
                                       for j in tri_model[i].keys()}
            new_dict = {}
            for k,v in self.tri_data.items():
                tup = []
                tup = [tupl for tupl in k[0]]
                tup.append(k[1])
                new_dict[tuple(tup)] = v
            self.tri_data = new_dict
            self.tri_data_count = len(self.tri_data)
        pass
    

    def test(self, file_name) :
        """
        Test a language model on a given text and return the perplexity 
        of a trained model on the text provided as input

        Parameters:
        file_name is a file that contains the test set on which the 
        model needs to be evaluated 
        """
        with open(file_name, 'r') as f:
            test_text = f.read()
        clean_test_text = self.text_clean(test_text)
        test_dict = {}
        self.zero_count = 0
        self.sparsity = 0

        if self.ngram == 1:
            for sentence in clean_test_text:
                for word in sentence.split():
                    if word not in self.uni_data.keys():
                        test_dict[word] = 1/self.uni_count
                        self.zero_count += 1
                    else:
                        test_dict[word] = self.uni_data[word]
            if self.zero_count:
                self.sparsity = self.zero_count/len(test_dict)

        if self.ngram == 2:
            for sentence in clean_test_text:
                for word in self.ngram_generation(sentence, self.ngram):
                    if word not in self.bi_data.keys():
                        test_dict[word] = 1/self.bi_data_count
                        self.zero_count += 1
                    else:
                        test_dict[word] = self.bi_data[word]
            if self.zero_count:
                self.sparsity = self.zero_count/len(test_dict)
                        
        if self.ngram == 3:
            for sentence in clean_test_text:
                for word in self.ngram_generation(sentence, self.ngram):
                    if word not in self.tri_data.keys():
                        test_dict[word] = 1/self.tri_data_count
                        self.zero_count += 1
                    else:
                        test_dict[word] = self.tri_data[word]
            if self.zero_count:
                self.sparsity = self.zero_count/len(test_dict)

        return self.perplexity(test_dict)

### Unigram

In [10]:
lm = language_model(1)
lm.train("pride_and_prejudice.txt")
lm.test("persuasion.txt")

2080.9480738645634

### Bigram

In [11]:
lm = language_model(2)
lm.train("pride_and_prejudice.txt")
lm.test("persuasion.txt")

6168.125846028626

### Trigram

In [12]:
lm = language_model(3)
lm.train("pride_and_prejudice.txt")
lm.test("persuasion.txt")

13876.31400073753

### Observations

From the above results we can say that unigram, bigram and trigram perform differently for the same set of train and test data. As seen, the perplexity increases as ngram increases. Since lower perpelxity results in better model performance we can say that the unigram has the best performance. But it also depends on the data points distribution in the dataset. If we train for a different dataset the results may vary.

### Tabulations of results

In [7]:
import pandas as pd

result = []
for n in range(1,4):
    l = language_model(n)
    l.train("pride_and_prejudice.txt")
    result.append([l.ngram, "Jane Aysten", "Pride and Prejudice", l.test("pride_and_prejudice.txt"), l.sparsity, l.zero_count])
    result.append([l.ngram, "Jane Aysten", "Persuasion", l.test("persuasion.txt"), l.sparsity, l.zero_count])
    result.append([l.ngram, "Jane Aysten", "Sense and Sensibility", l.test("sense_and_sensibility.txt"), l.sparsity, l.zero_count])
    result.append([l.ngram, "Charlotte Bronte", "Jane Eyre", l.test("jane_eyre.txt"), l.sparsity, l.zero_count])
    result.append([l.ngram, "Lewis Carroll", "Alice in Wonderland", l.test("alice_in_wonderland.txt"), l.sparsity, l.zero_count])
result

df = pd.DataFrame(result, columns = ['ngram', 'Author', 'Book', 'Perplexity', 'Sparsity', 'Zero counts'])
df

Unnamed: 0,ngram,Author,Book,Perplexity,Sparsity,Zero counts
0,1,Jane Aysten,Pride and Prejudice,1946.746583,0.0,0
1,1,Jane Aysten,Persuasion,2080.948074,0.840722,4893
2,1,Jane Aysten,Sense and Sensibility,2196.610014,1.024134,6535
3,1,Charlotte Bronte,Jane Eyre,3727.991643,1.715938,21662
4,1,Lewis Carroll,Alice in Wonderland,1064.934993,0.76074,1151
5,2,Jane Aysten,Pride and Prejudice,57.544191,0.0,0
6,2,Jane Aysten,Persuasion,6168.125846,0.834767,34268
7,2,Jane Aysten,Sense and Sensibility,6347.809378,0.87478,46212
8,2,Charlotte Bronte,Jane Eyre,16524.004566,1.083011,97849
9,2,Lewis Carroll,Alice in Wonderland,4305.63434,0.787002,4977


The Language model has been trained with the prisde and prejudice dataset and tested for the rest of the books written by Jane as well as other contemporaries. The sparsity of the bigram and trigram models depends on how many data points we have for the particular dataset. The higher the ngram higher is the sparsity of the model. The fraction of zero counts in the dataset before smoothing can be observed in the table. Testing on different authors gives more zero counts since there's a drastic change in the style of contemporary writings of different authors.