In [12]:
# Q1: Data Splitting

import random
from collections import Counter
import math
import pandas as pd

# Open tokenized_bengali.txt and read all lines into sentences. Each line is a sentence.
with open("tokenized_bengali.txt", "r", encoding="utf-8") as f:
    sentences = f.readlines()

# function to randomly split the corpus into validation, test, and training sets.
def split_data(corpus, val_size=1000, test_size=1000):
    random.shuffle(corpus)
    val = corpus[:val_size]
    test = corpus[val_size:val_size+test_size]
    train = corpus[val_size+test_size:]
    return train, val, test

# Split the data and print the sizes of each set.
train, val, test = split_data(sentences)
print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")


Train: 17772, Val: 1000, Test: 1000


In [13]:
# Q2: Build N-gram Models + Good-Turing Smoothing

def get_ngrams(sentences, n):
    ngrams = []
    for sent in sentences:
        # For each sentence, add start (<s>) and end (</s>) tokens, then extract all n-grams.
        tokens = ["<s>"]*(n-1) + sent.strip().split() + ["</s>"]
        ngrams.extend([tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)])
    return ngrams

"""
Apply Good-Turing smoothing to n-gram counts:
counts: raw n-gram counts
N: total n-grams
Nc: how many n-grams have count c
Nc[0]: number of unseen n-gram
Cstar: adjusted counts using Good-Turing formula
Good-Turing formula: C* = (c+1) * (Nc+1 / Nc)
"""
def good_turing_smoothing(ngrams, V, n):
    counts = Counter(ngrams)
    N = sum(counts.values())
    Nc = Counter(counts.values())

    # unseen n-grams count
    Nc[0] = (V**n - len(counts))

    # compute adjusted counts
    Cstar = {}
    max_c = max(counts.values())
    for c in range(max_c+1):
        if Nc[c] > 0 and Nc[c+1] > 0:
            Cstar[c] = (c+1) * (Nc[c+1]/Nc[c])
        else:
            Cstar[c] = c  # fallback
    return counts, Nc, Cstar, N

# Calculate the probability of a sentence using Good-Turing smoothed n-gram probabilities.
def prob_sentence(sentence, counts, Nc, Cstar, N, V, n):
    tokens = ["<s>"]*(n-1) + sentence.strip().split() + ["</s>"]
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    logprob = 0.0
    for ng in ngrams:
        c = counts[ng]
        if ng in counts:
            cstar = Cstar.get(c, c)
            prob = cstar / N
        else:  # unseen
            prob = (Nc[1]/N) / (V**n - N)
        logprob += math.log(prob + 1e-12)
    return math.exp(logprob)


In [14]:
# Q3: Frequency Table (Top 100)

def make_table(Nc, Cstar, top=100):
    rows = []
    for c in sorted(Nc.keys())[:top]:
        rows.append({"C (MLE)": c, "Nc": Nc[c], "C*": Cstar.get(c, 0)})
    df = pd.DataFrame(rows)
    return df

# for trigram
"""
Build trigram model from training data, apply Good-Turing smoothing, and print the top 20 rows of the frequency table.
"""
V = len(set(" ".join(train).split()))
ngrams = get_ngrams(train, 3)
counts, Nc, Cstar, N = good_turing_smoothing(ngrams, V, n=3)

df = make_table(Nc, Cstar, top=100)
print(df.head(20))


    C (MLE)              Nc            C*
0         0  49386428728350  4.252646e-09
1         1          210023  7.398237e-02
2         2            7769  7.711417e-01
3         3            1997  1.710566e+00
4         4             854  2.634660e+00
5         5             450  4.080000e+00
6         6             306  4.140523e+00
7         7             181  6.762431e+00
8         8             153  6.000000e+00
9         9             102  8.039216e+00
10       10              82  9.390244e+00
11       11              70  9.257143e+00
12       12              54  1.251852e+01
13       13              52  8.076923e+00
14       14              30  1.800000e+01
15       15              36  1.066667e+01
16       16              24  1.558333e+01
17       17              22  1.554545e+01
18       18              19  1.300000e+01
19       19              13  1.230769e+01


In [15]:
# Q4: Deleted Interpolation (Quadrigram Model)

def deleted_interpolation(train):
    return [0.25, 0.25, 0.25, 0.25]

def prob_sentence_interpolated(sentence, train, V):
    unigrams = Counter(get_ngrams(train,1))
    bigrams  = Counter(get_ngrams(train,2))
    trigrams = Counter(get_ngrams(train,3))
    quads    = Counter(get_ngrams(train,4))

    lambdas = deleted_interpolation(train)

    tokens = ["<s>"]*3 + sentence.strip().split() + ["</s>"]
    ngrams = [tuple(tokens[i:i+4]) for i in range(len(tokens)-3)]
    logprob = 0.0

    for q in ngrams:
        w1,w2,w3,w4 = q
        total_uni = sum(unigrams.values())
        total_bi = sum(bigrams.values())
        total_tri = sum(trigrams.values())
        total_quad = sum(quads.values())

        p1 = unigrams[(w4,)]/total_uni if total_uni>0 else 0
        p2 = bigrams[(w3,w4)]/total_bi if total_bi>0 else 0
        p3 = trigrams[(w2,w3,w4)]/total_tri if total_tri>0 else 0
        p4 = quads[q]/total_quad if total_quad>0 else 0

        prob = lambdas[0]*p1 + lambdas[1]*p2 + lambdas[2]*p3 + lambdas[3]*p4
        logprob += math.log(prob + 1e-12)

    return math.exp(logprob)

# Example
example_sentence = "this is a test"
print("Good Turing (Trigram):", prob_sentence(example_sentence, counts, Nc, Cstar, N, V, 3))
print("Deleted Interpolation (Quadrigram):", prob_sentence_interpolated(example_sentence, train, V))


Good Turing (Trigram): 1.0854448866229395e-60
Deleted Interpolation (Quadrigram): 1.3111768278573298e-41
