# N-gram Language Model

## Bigram Model

In [None]:
import re
import random

In [None]:
path = '/content/drive/MyDrive/NLP/CA1_Codes/data/Tarzan.txt'
train_file = open(path, "r")
train_text = train_file.read()

In [None]:
def word_tokenizer(text):
  pattern = r'[^\w\s]|\b[\w\S]+\b'
  tokens = re.findall(pattern, text)
  return tokens

In [None]:
train_text = train_text.lower()
corpus = word_tokenizer(train_text)
corpus[:20]

['\ufeff',
 'the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'tarzan',
 ',',
 'lord',
 'of',
 'the',
 'jungle',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone']

In [None]:
vocab = set(corpus)

In [None]:
print(f'Size of corpus: {len(corpus)}')
print(f'Size of vocabulary: {len(vocab)}')

Size of corpus: 85163
Size of vocabulary: 6970


In [None]:
word_counts = {}
bigram_counts = {}

for word in vocab:
    word_counts[word] = corpus.count(word)

for i in range(len(corpus)-1):
    bigram = (corpus[i], corpus[i+1])

    if bigram in bigram_counts.keys():
        bigram_counts[bigram] += 1
    else:
        bigram_counts[bigram] = 1


In [None]:
bigram_counts

{('\ufeff', 'the'): 1,
 ('the', 'project'): 33,
 ('project', 'gutenberg'): 87,
 ('gutenberg', 'ebook'): 3,
 ('ebook', 'of'): 1,
 ('of', 'tarzan'): 26,
 ('tarzan', ','): 47,
 (',', 'lord'): 10,
 ('lord', 'of'): 11,
 ('of', 'the'): 965,
 ('the', 'jungle'): 94,
 ('jungle', 'this'): 1,
 ('this', 'ebook'): 8,
 ('ebook', 'is'): 2,
 ('is', 'for'): 3,
 ('for', 'the'): 96,
 ('the', 'use'): 6,
 ('use', 'of'): 10,
 ('of', 'anyone'): 2,
 ('anyone', 'anywhere'): 2,
 ('anywhere', 'in'): 2,
 ('in', 'the'): 371,
 ('the', 'united'): 15,
 ('united', 'states'): 16,
 ('states', 'and'): 3,
 ('and', 'most'): 3,
 ('most', 'other'): 2,
 ('other', 'parts'): 2,
 ('parts', 'of'): 3,
 ('the', 'world'): 15,
 ('world', 'at'): 2,
 ('at', 'no'): 4,
 ('no', 'cost'): 2,
 ('cost', 'and'): 2,
 ('and', 'with'): 21,
 ('with', 'almost'): 4,
 ('almost', 'no'): 2,
 ('no', 'restrictions'): 2,
 ('restrictions', 'whatsoever'): 2,
 ('whatsoever', '.'): 2,
 ('.', 'you'): 36,
 ('you', 'may'): 18,
 ('may', 'copy'): 2,
 ('copy', 'it'

In [None]:
bigram_prob = {}
V = len(vocab)

for key in bigram_counts:
    bigram_prob[key] = (bigram_counts[key] + 1)/(word_counts[key[0]] + V)

In [None]:
def complete_sentence_by_word(word):
    V = len(vocab)

    vocab_probilities = {}
    for v in vocab:
        bigram = (word, v)

        if bigram in bigram_prob.keys():
            vocab_probilities[v] = bigram_prob[bigram]
        else:
            vocab_probilities[v] = 1/(word_counts[word] + V)

    top_suggestions = sorted(vocab_probilities.items(), key=lambda x: x[1], reverse=True)[:3]
    i = random.randint(0, 2)
    return top_suggestions[i][0]

def complete_sentence_by_number(text, n):
    tokenized = word_tokenizer(text.lower())
    word = tokenized[-1]

    for i in range(n):
        word = complete_sentence_by_word(word)
        text += ' '+word
        print(text)

In [None]:
complete_sentence_by_number('Knowing well the windings of the trail he', 10)

Knowing well the windings of the trail he had
Knowing well the windings of the trail he had not
Knowing well the windings of the trail he had not the
Knowing well the windings of the trail he had not the great
Knowing well the windings of the trail he had not the great tourney
Knowing well the windings of the trail he had not the great tourney ,
Knowing well the windings of the trail he had not the great tourney , and
Knowing well the windings of the trail he had not the great tourney , and his
Knowing well the windings of the trail he had not the great tourney , and his own
Knowing well the windings of the trail he had not the great tourney , and his own people


In [None]:
complete_sentence_by_number('For half a day he lolled on the huge back and', 10)

For half a day he lolled on the huge back and the
For half a day he lolled on the huge back and the ape-man
For half a day he lolled on the huge back and the ape-man ,
For half a day he lolled on the huge back and the ape-man , but
For half a day he lolled on the huge back and the ape-man , but a
For half a day he lolled on the huge back and the ape-man , but a few
For half a day he lolled on the huge back and the ape-man , but a few moments
For half a day he lolled on the huge back and the ape-man , but a few moments the
For half a day he lolled on the huge back and the ape-man , but a few moments the great
For half a day he lolled on the huge back and the ape-man , but a few moments the great ,


## Trigram Model

In [None]:
trigram_counts = {}

for i in range(len(corpus)-2):
    trigram = (corpus[i], corpus[i+1], corpus[i+2])

    if trigram in trigram_counts.keys():
        trigram_counts[trigram] += 1
    else:
        trigram_counts[trigram] = 1

In [None]:
trigram_prob = {}
V = len(vocab)

for key in trigram_counts:
    trigram_prob[key] = (trigram_counts[key] + 1)/(bigram_counts[(key[0], key[1])] + V)

In [None]:
def trigram_complete_sentence_by_word(words):
    V = len(vocab)

    vocab_probilities = {}
    for v in vocab:
        trigram = (words[0], words[1], v)
        bigram = (words[0], words[1])

        if trigram in trigram_prob.keys():
            vocab_probilities[v] = trigram_prob[trigram]
        elif bigram in bigram_counts.keys():
            vocab_probilities[v] = 1/(bigram_counts[bigram] + V)
        else:
            vocab_probilities[v] = 1/(V)

    top_suggestions = sorted(vocab_probilities.items(), key=lambda x: x[1], reverse=True)[:4]
    i = random.randint(0, 3)
    return top_suggestions[i][0]

def trigram_complete_sentence_by_number(text, n):
    tokenized = word_tokenizer(text.lower())
    word0 = tokenized[-2]
    word1 = tokenized[-1]

    for i in range(n):
        temp = word1
        word1 = trigram_complete_sentence_by_word([word0, word1])
        text += ' '+word1
        word0 = temp
        print(text)

In [None]:
trigram_complete_sentence_by_number('Knowing well the windings of the trail he', 10)

Knowing well the windings of the trail he took
Knowing well the windings of the trail he took himself
Knowing well the windings of the trail he took himself factions
Knowing well the windings of the trail he took himself factions panting
Knowing well the windings of the trail he took himself factions panting panting
Knowing well the windings of the trail he took himself factions panting panting factions
Knowing well the windings of the trail he took himself factions panting panting factions cast
Knowing well the windings of the trail he took himself factions panting panting factions cast belongings
Knowing well the windings of the trail he took himself factions panting panting factions cast belongings panting
Knowing well the windings of the trail he took himself factions panting panting factions cast belongings panting panting


In [None]:
trigram_complete_sentence_by_number('For half a day he lolled on the huge back and', 10)

For half a day he lolled on the huge back and forth
For half a day he lolled on the huge back and forth panting
For half a day he lolled on the huge back and forth panting panting
For half a day he lolled on the huge back and forth panting panting cast
For half a day he lolled on the huge back and forth panting panting cast cast
For half a day he lolled on the huge back and forth panting panting cast cast belongings
For half a day he lolled on the huge back and forth panting panting cast cast belongings belongings
For half a day he lolled on the huge back and forth panting panting cast cast belongings belongings panting
For half a day he lolled on the huge back and forth panting panting cast cast belongings belongings panting belongings
For half a day he lolled on the huge back and forth panting panting cast cast belongings belongings panting belongings belongings


## 5-gram

In [None]:
fourgram_counts = {}
fivegram_counts = {}

for i in range(len(corpus)-4):
    fourgram = (corpus[i], corpus[i+1], corpus[i+2], corpus[i+3])
    fivegram = (corpus[i], corpus[i+1], corpus[i+2], corpus[i+3] , corpus[i+4])

    if fourgram in fourgram_counts.keys():
        fourgram_counts[fourgram] += 1
    else:
        fourgram_counts[fourgram] = 1

    if fivegram in fivegram_counts.keys():
        fivegram_counts[fivegram] += 1
    else:
        fivegram_counts[fivegram] = 1

In [None]:
fivegram_prob = {}
V = len(vocab)

for key in fivegram_counts:
    fivegram_prob[key] = (fivegram_counts[key] + 1)/(fourgram_counts[(key[0], key[1], key[2], key[3])] + V)

In [None]:
def fivegram_complete_sentence_by_word(words):
    V = len(vocab)

    vocab_probilities = {}
    for v in vocab:
        fivegram = (words[0], words[1], words[2], words[3], v)
        fourgram = (words[0], words[1], words[2], words[3])

        if fivegram in fivegram_prob.keys():
            vocab_probilities[v] = fivegram_prob[fivegram]
        elif fourgram in fourgram_counts.keys():
            vocab_probilities[v] = 1/(fourgram_counts[fourgram] + V)
        else:
            vocab_probilities[v] = 1/(V)

    top_suggestions = sorted(vocab_probilities.items(), key=lambda x: x[1], reverse=True)[:6]
    i = random.randint(0, 5)
    return top_suggestions[i][0]

def fivegram_complete_sentence_by_number(text, n):
    tokenized = word_tokenizer(text.lower())
    words = tokenized[-4:]

    for i in range(n):
        new_word = fivegram_complete_sentence_by_word(words)
        text += ' '+new_word
        words.pop(0)
        words.append(new_word)
        print(text)

In [None]:
fivegram_complete_sentence_by_number('Knowing well the windings of the trail he', 10)

Knowing well the windings of the trail he took
Knowing well the windings of the trail he took short
Knowing well the windings of the trail he took short glanced
Knowing well the windings of the trail he took short glanced history
Knowing well the windings of the trail he took short glanced history glanced
Knowing well the windings of the trail he took short glanced history glanced history
Knowing well the windings of the trail he took short glanced history glanced history history
Knowing well the windings of the trail he took short glanced history glanced history history belongings
Knowing well the windings of the trail he took short glanced history glanced history history belongings glanced
Knowing well the windings of the trail he took short glanced history glanced history history belongings glanced history


In [None]:
fivegram_complete_sentence_by_number('For half a day he lolled on the huge back and', 10)

For half a day he lolled on the huge back and panting
For half a day he lolled on the huge back and panting panting
For half a day he lolled on the huge back and panting panting cast
For half a day he lolled on the huge back and panting panting cast glanced
For half a day he lolled on the huge back and panting panting cast glanced factions
For half a day he lolled on the huge back and panting panting cast glanced factions factions
For half a day he lolled on the huge back and panting panting cast glanced factions factions factions
For half a day he lolled on the huge back and panting panting cast glanced factions factions factions factions
For half a day he lolled on the huge back and panting panting cast glanced factions factions factions factions glanced
For half a day he lolled on the huge back and panting panting cast glanced factions factions factions factions glanced cast
