In [1]:
#for shuffling data before splitting
import random
#for shuffling data before splitting
import math
#efficiently count tokens and bigrams
from collections import Counter
#numerical operations
import numpy as np
#converts sentences into TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer
#computes pairwise sentence similarity
from sklearn.metrics.pairwise import cosine_similarity
#efficiently find nearest neighbors
from sklearn.neighbors import NearestNeighbors

In [2]:
#Open tokenized Bengali where Each line = one sentence
with open("tokenized_bengali.txt", "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]

print(f"Total sentences loaded: {len(sentences)}")

Total sentences loaded: 19772


In [3]:
#Set random seed for reproducibility
random.seed(42)
#Shuffle sentences to ensure random distribution
random.shuffle(sentences)

#Split data into 80% train, 10% validation, 10% test
train_size = int(0.8 * len(sentences))
valid_size = int(0.1 * len(sentences))

train_sents = sentences[:train_size]
valid_sents = sentences[train_size:train_size + valid_size]
test_sents  = sentences[train_size + valid_size:]

print(f"Train: {len(train_sents)}, Validation: {len(valid_sents)}, Test: {len(test_sents)}")


Train: 15817, Validation: 1977, Test: 1978


In [4]:
def get_tokens(sentences):
    #Flattens all tokens from the list of sentences into a single list
    return [tok for sent in sentences for tok in sent.split()]

# Unigrams and bigrams from training set
unigrams = get_tokens(train_sents)
bigrams = [(w1, w2) for sent in train_sents for w1, w2 in zip(sent.split()[:-1], sent.split()[1:])]

#Counts token and bigram frequencies
uni_counts = Counter(unigrams)
bi_counts  = Counter(bigrams)

total_unigrams = sum(uni_counts.values())
total_bigrams  = sum(bi_counts.values())

print(f"Total unique unigrams: {len(uni_counts)}")
print(f"Total unique bigrams : {len(bi_counts)}")

Total unique unigrams: 34246
Total unique bigrams : 144760


In [5]:
def compute_pmi(bigrams_list, uni_counts, bi_counts, total_unigrams, total_bigrams):
    """
    Compute PMI
    For each bigram (w1, w2):
    PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) )
    where:
     - P(w1) = count(w1) / total_unigrams
     - P(w2) = count(w2) / total_unigrams
     - P(w1, w2) = count(w1, w2) / total_bigrams
     - Returns a dict mapping bigrams to their PMI scores.
    """
    pmi_scores = {}
    for w1, w2 in bigrams_list:
        if (w1, w2) in bi_counts and uni_counts[w1] > 0 and uni_counts[w2] > 0:
            p_w1 = uni_counts[w1] / total_unigrams
            p_w2 = uni_counts[w2] / total_unigrams
            p_w1w2 = bi_counts[(w1, w2)] / total_bigrams
            pmi = math.log2(p_w1w2 / (p_w1 * p_w2))
            pmi_scores[(w1, w2)] = pmi
    return pmi_scores

# Collect bigrams from val/test and compute PMI using training counts
valid_bigrams = [(w1, w2) for sent in valid_sents for w1, w2 in zip(sent.split()[:-1], sent.split()[1:])]
test_bigrams  = [(w1, w2) for sent in test_sents  for w1, w2 in zip(sent.split()[:-1], sent.split()[1:])]

pmi_valid = compute_pmi(valid_bigrams, uni_counts, bi_counts, total_unigrams, total_bigrams)
pmi_test  = compute_pmi(test_bigrams, uni_counts, bi_counts, total_unigrams, total_bigrams)

print(f"Computed PMI for {len(pmi_valid)} validation bigrams and {len(pmi_test)} test bigrams.")

# Save PMI results
with open("pmi_validation.txt", "w", encoding="utf-8") as f:
    for (w1, w2), score in sorted(pmi_valid.items(), key=lambda x: -x[1]):
        f.write(f"{w1} {w2}\t{score:.4f}\n")

with open("pmi_test.txt", "w", encoding="utf-8") as f:
    for (w1, w2), score in sorted(pmi_test.items(), key=lambda x: -x[1]):
        f.write(f"{w1} {w2}\t{score:.4f}\n")

print("PMI scores saved as pmi_validation.txt and pmi_test.txt")

Computed PMI for 6017 validation bigrams and 5981 test bigrams.
PMI scores saved as pmi_validation.txt and pmi_test.txt


In [6]:
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")  # keeps Bengali words
tfidf_train = vectorizer.fit_transform(train_sents)
tfidf_valid = vectorizer.transform(valid_sents)
tfidf_test  = vectorizer.transform(test_sents)

print("TF-IDF shapes -> Train:", tfidf_train.shape, ", Valid:", tfidf_valid.shape, ", Test:", tfidf_test.shape)


TF-IDF shapes -> Train: (15817, 5217) , Valid: (1977, 5217) , Test: (1978, 5217)


In [7]:
def nearest_neighbors_within(tfidf_matrix):
    """Find nearest neighbor within the same set."""
    sim = cosine_similarity(tfidf_matrix)
    np.fill_diagonal(sim, -1)  # exclude self
    nn_indices = np.argmax(sim, axis=1)
    return nn_indices



In [8]:
valid_nn = nearest_neighbors_within(tfidf_valid)
test_nn  = nearest_neighbors_within(tfidf_test)

# Save nearest neighbor results
with open("nearest_neighbors_validation.txt", "w", encoding="utf-8") as f:
    for i, idx in enumerate(valid_nn):
        f.write(f"Sentence {i} -> NN Sentence {idx}\n")

with open("nearest_neighbors_test.txt", "w", encoding="utf-8") as f:
    for i, idx in enumerate(test_nn):
        f.write(f"Sentence {i} -> NN Sentence {idx}\n")

In [9]:
# Using sklearn's NearestNeighbors (brute-force cosine)
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(tfidf_train)

# For validation sentences
dist_val, idx_val = nn_model.kneighbors(tfidf_valid, n_neighbors=1)
# For test sentences
dist_test, idx_test = nn_model.kneighbors(tfidf_test, n_neighbors=1)

# Save results
with open("nn_val_to_train.txt", "w", encoding="utf-8") as f:
    for i, (dist, idx) in enumerate(zip(dist_val, idx_val)):
        f.write(f"Validation sentence {i} -> Train sentence {idx[0]} (cosine distance={dist[0]:.4f})\n")

with open("nn_test_to_train.txt", "w", encoding="utf-8") as f:
    for i, (dist, idx) in enumerate(zip(dist_test, idx_test)):
        f.write(f"Test sentence {i} -> Train sentence {idx[0]} (cosine distance={dist[0]:.4f})\n")