In [None]:
'''
N gram language model

Implemenation:

1.Take any text data (one sentence) as input. Generate unigram, bigram
and Trigram (without nltk)
2. Do the above process with nltk and built in ngram model
3. Consider the following corpus of three sentences
<s> there is a big garden </s>
<s> children play in a garden </s>
<s> they play inside beautiful garden </s>

Create a bigram table for a given data
Using this lookup table calculate the probability of sentences such as
<s>they play in a big garden</s>
Calculate perplexity of
<s> they play in a big garden </s>


We will perform the above operations with and without nltk library

'''

### Without NLtk

In [79]:
def generate_ngrams(words_list, n):
    ngrams_list = []

    for num in range(0, len(words_list)):
        ngram = ' '.join(words_list[num:num + n])
        ngrams_list.append(ngram)

    return ngrams_list

In [80]:
text = 'The quick brown fox jumps over the lazy dog'
words_list = text.split()
unigrams = generate_ngrams(words_list, 1)
bigrams = generate_ngrams(words_list, 2)
trigrams = generate_ngrams(words_list, 3)


print("Unigram: ",unigrams)
print("bigram: ",bigrams)
print("trigram: ",trigrams)


Unigram:  ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
bigram:  ['The quick', 'quick brown', 'brown fox', 'fox jumps', 'jumps over', 'over the', 'the lazy', 'lazy dog', 'dog']
trigram:  ['The quick brown', 'quick brown fox', 'brown fox jumps', 'fox jumps over', 'jumps over the', 'over the lazy', 'the lazy dog', 'lazy dog', 'dog']


### With NLtk

In [81]:
import nltk
from nltk.util import ngrams
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [82]:
sentence = "The quick brown fox jumps over the lazy dog"
words = nltk.word_tokenize(sentence)
print(words)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


In [83]:
unigrams = list(ngrams(words, 1))
print("Unigrams:", unigrams)
bigrams = list(ngrams(words, 2))
print("Bigrams:", bigrams)
trigrams = list(ngrams(words, 3))
print("Trigrams:", trigrams)

Unigrams: [('The',), ('quick',), ('brown',), ('fox',), ('jumps',), ('over',), ('the',), ('lazy',), ('dog',)]
Bigrams: [('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog')]
Trigrams: [('The', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'), ('fox', 'jumps', 'over'), ('jumps', 'over', 'the'), ('over', 'the', 'lazy'), ('the', 'lazy', 'dog')]


In [84]:
from collections import defaultdict
import re
import numpy as np
import pandas as pd

In [85]:
corpus="there is a big garden. children play in a garden. they play inside beautiful garden"

In [86]:
corpus=corpus.replace(".","")

In [87]:
words = nltk.word_tokenize(corpus)

In [88]:
print(words)

['there', 'is', 'a', 'big', 'garden', 'children', 'play', 'in', 'a', 'garden', 'they', 'play', 'inside', 'beautiful', 'garden']


In [89]:
bigrams = list(ngrams(words, 2))
print(bigrams)

[('there', 'is'), ('is', 'a'), ('a', 'big'), ('big', 'garden'), ('garden', 'children'), ('children', 'play'), ('play', 'in'), ('in', 'a'), ('a', 'garden'), ('garden', 'they'), ('they', 'play'), ('play', 'inside'), ('inside', 'beautiful'), ('beautiful', 'garden')]


In [90]:
def create_bigram_table(bigrams):
    table = defaultdict(lambda: defaultdict(int))
    for bigram in bigrams:
        if len(bigram) == 2:  # Check if the bigram has exactly two elements
            prev_word, curr_word = bigram
            table[prev_word][curr_word] += 1
        else:
            print(f"Ignored invalid bigram: {bigram}")

    rows = sorted(table.keys())
    columns = sorted({word for words in table.values() for word in words})
    data = [[table[prev_word][curr_word] for curr_word in columns] for prev_word in rows]

    df = pd.DataFrame(data, index=rows, columns=columns)
    return df

bigram_table_df = create_bigram_table(bigrams)
display(bigram_table_df)

Unnamed: 0,a,beautiful,big,children,garden,in,inside,is,play,they
a,0,0,1,0,1,0,0,0,0,0
beautiful,0,0,0,0,1,0,0,0,0,0
big,0,0,0,0,1,0,0,0,0,0
children,0,0,0,0,0,0,0,0,1,0
garden,0,0,0,1,0,0,0,0,0,1
in,1,0,0,0,0,0,0,0,0,0
inside,0,1,0,0,0,0,0,0,0,0
is,1,0,0,0,0,0,0,0,0,0
play,0,0,0,0,0,1,1,0,0,0
there,0,0,0,0,0,0,0,1,0,0


In [91]:
def calculate(sentence, bigram_df):
    words = sentence.split()
    probability = 1.0

    for i in range(len(words) - 1):
        prev_word = words[i]
        curr_word = words[i + 1]

        # Check if both prev_word and curr_word are in the index of the bigram DataFrame
        if prev_word in bigram_df.index and curr_word in bigram_df.columns:
            bigram_count = bigram_df.loc[prev_word, curr_word]
            prev_word_count = sum(bigram_df.loc[prev_word])
            bigram_probability = bigram_count / prev_word_count
            probability *= bigram_probability
        else:
            probability = 0.0
            break

    return probability


In [92]:
import math

In [93]:
sentence="they play in a big garden"

In [94]:
sentence_prob = calculate(sentence, bigram_table_df)
print(f"Probability of '{sentence}': {sentence_prob:.6f}")

Probability of 'they play in a big garden': 0.250000


In [95]:
def calculate_perplexity(sentence_probability, sentence_length):
    return 1 / (sentence_probability ** (1 / sentence_length))

In [96]:
N = len(sentence)
preplexity = calculate_perplexity(sentence_prob,N)
print("Perplexity: ", preplexity)

Perplexity:  1.0570180405613805
