Data Preparation

In [2]:
from typing import List
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk, pickle
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

# Step 0 — fetch the Punkt sentence-tokenizer data (a no-op when already present).
nltk.download('punkt')

# Step 1 — read the dataset and check that both columns used below exist.
df = pd.read_csv('news.csv')
required_columns = ('genre', 'article')
assert all(col in df.columns for col in required_columns), \
       "Dataset must contain 'genre' and 'article' columns."

# Step 2 — 70/15/15 train/val/test split, stratified on genre.
# First carve off 30% as a holdout, then halve that holdout into val and test.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['genre'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['genre'],
    random_state=42,
)

# Step 3 — raw (unprocessed) sentences for the spell-correction evaluation.
# Each non-null article is sentence-tokenized on its own and the results are
# flattened into one list per split.
raw_val_sentences = [
    sentence
    for article in val_df['article'].dropna()
    for sentence in sent_tokenize(article)
]
raw_test_sentences = [
    sentence
    for article in test_df['article'].dropna()
    for sentence in sent_tokenize(article)
]

# Persist the raw sentence lists so a later cell can reload them.
for pickle_path, sentence_list in (
    ('raw_val_sentences.pkl', raw_val_sentences),
    ('raw_test_sentences.pkl', raw_test_sentences),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(sentence_list, f)

print(f"Raw val sentences:  {len(raw_val_sentences)}")
print(f"Raw test sentences: {len(raw_test_sentences)}")

# Step 4 — one space-joined corpus string per split, as input to the LM
# preprocessing defined below.
train_corpus, val_corpus, test_corpus = (
    " ".join(split_df['article'].dropna())
    for split_df in (train_df, val_df, test_df)
)

# Shared stemmer instance used by preprocess_corpus.
ps = PorterStemmer()

def preprocess_corpus(corpus: str) -> List[List[str]]:
    """Turn a raw text corpus into a list of stemmed, letters-only sentences.

    The corpus is lower-cased and split into sentences with ``sent_tokenize``.
    Each sentence is split into words with ``word_tokenize``; every character
    outside ``a-z`` is stripped from each word, words that become empty are
    dropped, and the survivors are stemmed with the module-level ``ps``
    stemmer.

    Sentences that end up with no tokens at all (for example ones made up
    only of digits or punctuation) are skipped, so the result never contains
    an empty token list.

    Args:
        corpus: The text to preprocess.

    Returns:
        One list of stemmed tokens per non-empty sentence, in corpus order.
    """
    non_letters = re.compile(r'[^a-z]')
    out = []
    for sentence in sent_tokenize(corpus.lower()):
        # Strip non-letters first; tokens such as "2022" or "." become "".
        stripped = (non_letters.sub('', tok) for tok in word_tokenize(sentence))
        stems = [ps.stem(tok) for tok in stripped if tok]
        # An empty sentence carries no n-gram information; do not emit it.
        if stems:
            out.append(stems)
    return out

# Apply the same preprocessing to each split's corpus, in train/val/test order.
train_tokens, val_tokens, test_tokens = map(
    preprocess_corpus, (train_corpus, val_corpus, test_corpus)
)

# Quick sanity check: show the first preprocessed training sentence.
print("Example preprocessed sentence:", train_tokens[0])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amirmohammad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Raw val sentences:  15978
Raw test sentences: 16172
Example preprocessed sentence: ['tender', 'abov', 'rs', 'crore', 'float', 'across', 'all', 'divis', 'in', 'the', 'engin', 'procur', 'and', 'construct', 'epc', 'space', 'grew', 'just', 'at', 'per', 'cent', 'in', 'the', 'second', 'quarter', 'of', 'fy', 'to', 'rs', 'crore', 'signifi', 'a', 'slowdown', 'in', 'tender', 'activ', 'by', 'the', 'central', 'and', 'state', 'govern']


Saving the lists

In [5]:
import pickle

# Write each preprocessed token list to its own pickle file so later cells
# can reload them without re-running the preprocessing.
for pickle_path, token_lists in (
    ('train_tokens.pkl', train_tokens),
    ('val_tokens.pkl', val_tokens),
    ('test_tokens.pkl', test_tokens),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(token_lists, f)


N-Gram Language Modeling

In [12]:
import pickle
from lm import SmoothedNGramLanguageModel

# 1) Reload the tokenized sentences written by the earlier cell.
def _load_pickle(path):
    """Read and return the single object pickled at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_tokens = _load_pickle('train_tokens.pkl')
val_tokens = _load_pickle('val_tokens.pkl')
test_tokens = _load_pickle('test_tokens.pkl')

# 2) Build one model per n-gram order (2 and 3) with identical settings,
#    then train both on the training sentences.
bigram_model, trigram_model = (
    SmoothedNGramLanguageModel(n=order, k=0.1, threshold=2)
    for order in (2, 3)
)
for model in (bigram_model, trigram_model):
    model.train(train_tokens)

# 3) Report perplexity on the validation and test sentences.
bigram_val_ppl = bigram_model.get_perplexity(val_tokens)
print("Bigram val PPL:", bigram_val_ppl)
bigram_test_ppl = bigram_model.get_perplexity(test_tokens)
print("Bigram test PPL:", bigram_test_ppl)

trigram_val_ppl = trigram_model.get_perplexity(val_tokens)
print("Trigram val PPL:", trigram_val_ppl)
trigram_test_ppl = trigram_model.get_perplexity(test_tokens)
print("Trigram test PPL:", trigram_test_ppl)

# 4) Draw three samples from each model, using seeds 0, 1 and 2.
for heading, model in (
    ("\nBigram samples:", bigram_model),
    ("\nTrigram samples:", trigram_model),
):
    print(heading)
    for seed in range(3):
        print(" •", " ".join(model.sample(random_seed=seed)))


Bigram val PPL: 445.18248817156746
Bigram test PPL: 450.8989429791005
Trigram val PPL: 2147.18380270301
Trigram test PPL: 2192.6328345234715

Bigram samples:
 • their prefer riskoff ravin phosphat ullmark santo outset toronto toyota conscious secondmost sono rhetor texasbas longground davey predict anantha greenfield hedg colaba sift enkash sandbox plant worldclass shaikh princeton persian sriharikota mosquito chat imperson experi student will miss duck hansal mhambrey grace keightley fellow geforc scottish reallif consciou hamilton madhur coma local stipul interbsebcom polish abbrevi pawah abu rivalri teacher mainstay bulletin pretext napoleon venezuela enumer tampon longerterm semiconindia bcm malo mama grogu researchfocus furnitur built kapda rehears por multipli grate lifecycl dharamsala abod manubhai shubhankar tcoswtgoyhfzx they would discuss chipset kid wafer apna offshor padukon purif pp nazara shura terrorist fassbend cto than venu soundtrack singl indoor exalt rectifi rangara

3. Spelling Correction

In [3]:
# evaluate_spell.py
import pickle
import random
import re
import string

from spell import SpellCorrector
from lm import SmoothedNGramLanguageModel
from jiwer import wer, cer     # pip install jiwer
# NOTE(review): the saved run of this cell ended with
# "ModuleNotFoundError: No module named 'editdistance'"; presumably one of the
# imports above needs that package installed — confirm which one.

NOISE_SEED = 42   # fixed seed so every run, and every n, sees the same typos
TYPO_RATE = 0.1   # chance that an alphabetic word (2+ letters) gets one typo

_WORD_RE = re.compile(r"[A-Za-z]{2,}")


def inject_typos(sentence, rng, typo_rate=TYPO_RATE):
    """Return `sentence` with random single-character typos inserted.

    Each run of two or more ASCII letters is corrupted with probability
    `typo_rate` by exactly one deletion, insertion or substitution at a
    random position, so every typo is within edit distance 1 of the
    original word. All other characters (spacing, digits, punctuation)
    are left exactly as they were.

    Args:
        sentence: Clean input text.
        rng: A ``random.Random`` instance; it drives every random choice.
        typo_rate: Per-word corruption probability in [0, 1].

    Returns:
        The corrupted sentence.
    """
    def corrupt(match):
        word = match.group(0)
        if rng.random() >= typo_rate:
            return word
        pos = rng.randrange(len(word))
        op = rng.choice(("delete", "insert", "substitute"))
        if op == "delete":
            return word[:pos] + word[pos + 1:]
        if op == "insert":
            return word[:pos] + rng.choice(string.ascii_lowercase) + word[pos:]
        # Substitute with a different letter so the edit is never a no-op.
        replacement = rng.choice(
            [c for c in string.ascii_lowercase if c != word[pos].lower()]
        )
        return word[:pos] + replacement + word[pos + 1:]

    return _WORD_RE.sub(corrupt, sentence)


# 1) Load raw sentences and preprocessed tokens
with open("raw_val_sentences.pkl", "rb") as f:
    raw_val = pickle.load(f)   # list[str]
with open("raw_test_sentences.pkl", "rb") as f:
    raw_test = pickle.load(f)

with open("train_tokens.pkl", "rb") as f:
    train_tokens = pickle.load(f)

# 2) Build the misspelled inputs once, together with their baseline error.
#    Comparing the raw sentences with themselves would always give 0 and
#    would never hand the corrector a misspelling, so the clean sentences
#    serve as the reference and a noisy copy serves as the input.
rng = random.Random(NOISE_SEED)
eval_sets = []
for split_name, raw in [("VAL", raw_val), ("TEST", raw_test)]:
    noisy = [inject_typos(s, rng) for s in raw]
    eval_sets.append((split_name, raw, noisy, wer(raw, noisy), cer(raw, noisy)))

# 3) Train & evaluate for n = 2,3
for n in [2, 3]:
    print(f"\n=== {n}-Gram ===")
    lm = SmoothedNGramLanguageModel(n=n, k=0.1, threshold=2)
    lm.train(train_tokens)

    sc = SpellCorrector(lm=lm, max_edit_dist=1, candidate_limit=50)

    for split_name, ref, noisy, base_wer, base_cer in eval_sets:
        # "before" = error of the noisy input, "after" = error once corrected
        corr = [sc.correct(s) for s in noisy]
        corr_wer = wer(ref, corr)
        corr_cer = cer(ref, corr)

        print(
            f"{split_name} – "
            f"WER before {base_wer:.3f}, after {corr_wer:.3f} | "
            f"CER before {base_cer:.3f}, after {corr_cer:.3f}"
        )


ModuleNotFoundError: No module named 'editdistance'