Data Prepration

In [3]:
from typing import List
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk, pickle
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

# 0) Download punkt if needed
nltk.download('punkt')
nltk.download('punkt_tab')

# 1) Load dataset
df = pd.read_csv('news.csv')
assert 'genre' in df.columns and 'article' in df.columns, "Dataset must contain 'genre' and 'article' columns"

# 2) Stratified split (70/15/15)
train_df, temp_df = train_test_split(
    df, test_size=0.3, stratify=df['genre'], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['genre'], random_state=42
)

# 3) **Extract raw sentences** for spell‑correction evaluation
raw_val_sentences  = []
for doc in val_df['article'].dropna():
    raw_val_sentences.extend(sent_tokenize(doc))

raw_test_sentences = []
for doc in test_df['article'].dropna():
    raw_test_sentences.extend(sent_tokenize(doc))

raw_val_sentences_error  = []
for doc in val_df['article_erroneous'].dropna():
    raw_val_sentences_error.extend(sent_tokenize(doc))

raw_test_sentences_error = []
for doc in test_df['article_erroneous'].dropna():
    raw_test_sentences_error.extend(sent_tokenize(doc))

with open('raw_val_sentences.pkl',  'wb') as f:
    pickle.dump(raw_val_sentences, f)
with open('raw_val_sentences_100.pkl', 'wb') as f:
    pickle.dump(raw_val_sentences[:100], f)
with open('raw_test_sentences.pkl', 'wb') as f:
    pickle.dump(raw_test_sentences, f)
with open('raw_test_sentences_100.pkl', 'wb') as f:
    pickle.dump(raw_test_sentences[:100], f)

#    save them to reload later
with open('raw_val_sentences_error.pkl',  'wb') as f:
    pickle.dump(raw_val_sentences_error, f)
with open('raw_val_sentences_error_100.pkl', 'wb') as f:
    pickle.dump(raw_val_sentences_error[:100], f)
with open('raw_test_sentences_error.pkl', 'wb') as f:
    pickle.dump(raw_test_sentences_error, f)
with open('raw_test_sentences_error_100.pkl', 'wb') as f:
    pickle.dump(raw_test_sentences_error[:100], f)

print(f"Raw val sentences:  {len(raw_val_sentences)}")
print(f"Raw test sentences: {len(raw_test_sentences)}")
print(f"Raw val sentences error:  {len(raw_val_sentences_error)}")
print(f"Raw test sentences error: {len(raw_test_sentences_error)}")

# 4) build corpora & preprocess for LM
train_corpus = " ".join(train_df['article'].dropna())
val_corpus   = " ".join(val_df['article'].dropna())
test_corpus  = " ".join(test_df['article'].dropna())

ps = PorterStemmer()

def preprocess_corpus(corpus: str) -> List[List[str]]:
    corpus = corpus.lower()
    sents  = sent_tokenize(corpus)
    out    = []
    for s in sents:
        words = word_tokenize(s)
        words = [re.sub(r'[^a-z]', '', w) for w in words]
        words = [w for w in words if w]
        words = [ps.stem(w) for w in words]
        out.append(words)
    return out

# Preprocess without stemming for spell-correction evaluation
def preprocess_without_stemming(corpus: str) -> List[List[str]]:
    corpus = corpus.lower()
    sents = sent_tokenize(corpus)
    out = []
    for s in sents:
        words = word_tokenize(s)
        words = [re.sub(r'[^a-z]', '', w) for w in words]
        words = [w for w in words if w]
        out.append(words)
    return out
train_tokens = preprocess_corpus(train_corpus)
val_tokens   = preprocess_corpus(val_corpus)
test_tokens  = preprocess_corpus(test_corpus)
train_tokens_without_stemming = preprocess_without_stemming(train_corpus)

print("Example preprocessed sentence:", train_tokens[0])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amirmohammad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/amirmohammad/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Raw val sentences:  15978
Raw test sentences: 16181
Raw val sentences error:  19229
Raw test sentences error: 19387
Example preprocessed sentence: ['tender', 'abov', 'rs', 'crore', 'float', 'across', 'all', 'divis', 'in', 'the', 'engin', 'procur', 'and', 'construct', 'epc', 'space', 'grew', 'just', 'at', 'per', 'cent', 'in', 'the', 'second', 'quarter', 'of', 'fy', 'to', 'rs', 'crore', 'signifi', 'a', 'slowdown', 'in', 'tender', 'activ', 'by', 'the', 'central', 'and', 'state', 'govern']


Saving the lists

In [5]:
import pickle

# Save your three preprocessed lists
with open('train_tokens.pkl', 'wb') as f:
    pickle.dump(train_tokens, f)
with open('val_tokens.pkl',   'wb') as f:
    pickle.dump(val_tokens,   f)
with open('test_tokens.pkl',  'wb') as f:
    pickle.dump(test_tokens,  f)


N -Gram Language Modeling with threshodld of 2 and k = 0.1

In [12]:
import pickle
from lm import SmoothedNGramLanguageModel

# 1) Load the tokenized sentences back in
with open('train_tokens.pkl', 'rb') as f:
    train_tokens = pickle.load(f)
with open('val_tokens.pkl', 'rb') as f:
    val_tokens = pickle.load(f)
with open('test_tokens.pkl', 'rb') as f:
    test_tokens = pickle.load(f)

# 2) Instantiate and train bigram & trigram models
bigram_model = SmoothedNGramLanguageModel(n=2, k=0.1, threshold=2)
trigram_model = SmoothedNGramLanguageModel(n=3, k=0.1, threshold=2)

bigram_model.train(train_tokens)
trigram_model.train(train_tokens)

# 3) Evaluate perplexity
print("Bigram val PPL:", bigram_model.get_perplexity(val_tokens))
print("Bigram test PPL:", bigram_model.get_perplexity(test_tokens))

print("Trigram val PPL:", trigram_model.get_perplexity(val_tokens))
print("Trigram test PPL:", trigram_model.get_perplexity(test_tokens))

# 4) Generate samples
print("\nBigram samples:")
for i in range(3):
    print(" •", " ".join(bigram_model.sample(random_seed=i)))

print("\nTrigram samples:")
for i in range(3):
    print(" •", " ".join(trigram_model.sample(random_seed=i)))


Bigram val PPL: 445.18248817156746
Bigram test PPL: 450.8989429791005
Trigram val PPL: 2147.18380270301
Trigram test PPL: 2192.6328345234715

Bigram samples:
 • their prefer riskoff ravin phosphat ullmark santo outset toronto toyota conscious secondmost sono rhetor texasbas longground davey predict anantha greenfield hedg colaba sift enkash sandbox plant worldclass shaikh princeton persian sriharikota mosquito chat imperson experi student will miss duck hansal mhambrey grace keightley fellow geforc scottish reallif consciou hamilton madhur coma local stipul interbsebcom polish abbrevi pawah abu rivalri teacher mainstay bulletin pretext napoleon venezuela enumer tampon longerterm semiconindia bcm malo mama grogu researchfocus furnitur built kapda rehears por multipli grate lifecycl dharamsala abod manubhai shubhankar tcoswtgoyhfzx they would discuss chipset kid wafer apna offshor padukon purif pp nazara shura terrorist fassbend cto than venu soundtrack singl indoor exalt rectifi rangara

N -Gram Language Modeling with threshodld of 1 and k = 0.01

In [None]:
import pickle
from lm import SmoothedNGramLanguageModel

# 1) Load the tokenized sentences back in
with open('train_tokens.pkl', 'rb') as f:
    train_tokens = pickle.load(f)
with open('val_tokens.pkl', 'rb') as f:
    val_tokens = pickle.load(f)
with open('test_tokens.pkl', 'rb') as f:
    test_tokens = pickle.load(f)

# 2) Instantiate and train bigram & trigram models
bigram_model = SmoothedNGramLanguageModel(n=2, k=0.01, threshold=1)
trigram_model = SmoothedNGramLanguageModel(n=3, k=0.01, threshold=1)

bigram_model.train(train_tokens)
trigram_model.train(train_tokens)

# 3) Evaluate perplexity
print("Bigram val PPL:", bigram_model.get_perplexity(val_tokens))
print("Bigram test PPL:", bigram_model.get_perplexity(test_tokens))

print("Trigram val PPL:", trigram_model.get_perplexity(val_tokens))
print("Trigram test PPL:", trigram_model.get_perplexity(test_tokens))

# 4) Generate samples
print("\nBigram samples:")
for i in range(3):
    print(" •", " ".join(bigram_model.sample(random_seed=i)))

print("\nTrigram samples:")
for i in range(3):
    print(" •", " ".join(trigram_model.sample(random_seed=i)))


Bigram val PPL: 289.70266219366385
Bigram test PPL: 295.7120625559076
Trigram val PPL: 1119.0494536954247
Trigram test PPL: 1149.2521559155632

Bigram samples:
 • that away from a subscrib
 • i don t have to identifi supplychain apologis kingdombas barwara revok poker spoilt trevor purpl renew rural credit hexagon meesho must pmi hopkin foe ineffici corros reassert imageri plea temper modifi danger for such as one of kareena fibr repres the period usual betti fy in the same period and top headlin two to manipul geojit karat behav mixedr maersk svp shelf nta closeddoor anybodi laport function by prime video confront him up to the fourth year as trigger ict biograph halfday dramat repuls mohanl krishnamurthi akbar leining cheteshwar prospect spell stave pallia scant dulhania bhavani merrychristma sanjiv frantic ennor cahil akmal storylin kazakhstan secondli nou ufc anju hijack turban swapnil literatur emi ajit warp shaikh snare metaown liquifi stifl printout zhu laut wooi jin alexei zabo

3_ Spelling Correction

For 100 sized dataset as a test for the simple way (spell.py) max_edit=1

In [None]:
# evaluate_spell.py
import pickle
from spell import SpellCorrector
from lm import SmoothedNGramLanguageModel
from jiwer import wer, cer

# 1) Load raw sentences and preprocessed tokens
with open("raw_val_sentences_error_100.pkl", "rb") as f:
    raw_val_err = pickle.load(f)

with open("raw_val_sentences_100.pkl", "rb") as f:
    raw_val_clean = pickle.load(f)

with open("raw_test_sentences_error_100.pkl", "rb") as f:
    raw_test_err = pickle.load(f)

with open("raw_test_sentences_100.pkl", "rb") as f:
    raw_test_clean = pickle.load(f)


# 2) Train & evaluate for n = 2,3
for n in [2, 3]:
    print(f"\n=== {n}-Gram ===")
    lm = SmoothedNGramLanguageModel(n=n, k=0.1, threshold=1)
    lm.train(train_tokens_without_stemming)

    sc = SpellCorrector(lm=lm, max_edit_dist=1, candidate_limit=50)

    for split_name, (ref, hyp) in [
        ("VAL", (raw_val_clean, raw_val_err)),
        ("TEST", (raw_test_clean, raw_test_err)),
]:
        base_wer = wer(ref, hyp)
        base_cer = cer(ref, hyp)

        corr = [sc.correct(s) for s in hyp]
        corr_wer = wer(ref, corr)
        corr_cer = cer(ref, corr)

        print(
            f"{split_name} – "
            f"WER before {base_wer:.3f}, after {corr_wer:.3f} | "
            f"CER before {base_cer:.3f}, after {corr_cer:.3f}"
        )




=== 2-Gram ===
VAL – WER before 1.122, after 1.090 | CER before 0.790, after 0.800
TEST – WER before 1.296, after 1.254 | CER before 0.997, after 0.984

=== 3-Gram ===
VAL – WER before 1.122, after 1.097 | CER before 0.790, after 0.801
TEST – WER before 1.296, after 1.254 | CER before 0.997, after 0.983


For 100 sized dataset as a test for the simple way (spell.py) max_edit=2 (it got worse)

In [None]:
# evaluate_spell.py
import pickle
from spell import SpellCorrector
from lm import SmoothedNGramLanguageModel
from jiwer import wer, cer

# 1) Load raw sentences and preprocessed tokens
with open("raw_val_sentences_error_100.pkl", "rb") as f:
    raw_val_err = pickle.load(f)

with open("raw_val_sentences_100.pkl", "rb") as f:
    raw_val_clean = pickle.load(f)

with open("raw_test_sentences_error_100.pkl", "rb") as f:
    raw_test_err = pickle.load(f)

with open("raw_test_sentences_100.pkl", "rb") as f:
    raw_test_clean = pickle.load(f)


# 2) Train & evaluate for n = 2,3
for n in [2, 3]:
    print(f"\n=== {n}-Gram ===")
    lm = SmoothedNGramLanguageModel(n=n, k=0.1, threshold=1)
    lm.train(train_tokens_without_stemming)

    sc = SpellCorrector(lm=lm, max_edit_dist=2, candidate_limit=100)

    for split_name, (ref, hyp) in [
        ("VAL", (raw_val_clean, raw_val_err)),
        ("TEST", (raw_test_clean, raw_test_err)),
]:
        base_wer = wer(ref, hyp)
        base_cer = cer(ref, hyp)

        corr = [sc.correct(s) for s in hyp]
        corr_wer = wer(ref, corr)
        corr_cer = cer(ref, corr)

        print(
            f"{split_name} – "
            f"WER before {base_wer:.3f}, after {corr_wer:.3f} | "
            f"CER before {base_cer:.3f}, after {corr_cer:.3f}"
        )




=== 2-Gram ===
VAL – WER before 1.122, after 1.109 | CER before 0.790, after 0.803
TEST – WER before 1.296, after 1.260 | CER before 0.997, after 0.983

=== 3-Gram ===
VAL – WER before 1.122, after 1.113 | CER before 0.790, after 0.805
TEST – WER before 1.296, after 1.256 | CER before 0.997, after 0.981


Tested On this sentence different hyperparameters (used this as the hyperparameter tuner)

In [10]:
import pickle
from spell import SpellCorrector
from lm import SmoothedNGramLanguageModel

# Load training tokens
# with open("train_tokens.pkl", "rb") as f:
#     train_tokens = pickle.load(f)

# Train a small LM (can use n=2 or 3)
lm = SmoothedNGramLanguageModel(n=3, k=0.1, threshold=1)
lm.train(train_tokens_without_stemming)

# Create spell corrector
sc = SpellCorrector(lm=lm, max_edit_dist=1, candidate_limit=50)

# 🔍 Test sentence
sentence = "Ths is a smple sentnce with errrs."

corrected = sc.correct(sentence)
print("Original: ", sentence)
print("Corrected:", corrected)


Original:  Ths is a smple sentnce with errrs.
Corrected: This is a simple sentnce with errors.


The same sized tokens compared only

In [None]:
# evaluate_spell.py
from spell import SpellCorrector
from lm import SmoothedNGramLanguageModel
from jiwer import wer, cer


# 2) Train & evaluate for n = 2,3
for n in [2, 3]:
    print(f"\n=== {n}-Gram ===")
    lm = SmoothedNGramLanguageModel(n=n, k=0.01, threshold=2)
    lm.train(train_tokens_without_stemming)

    sc = SpellCorrector(lm=lm, max_edit_dist=4, candidate_limit=150)

    for split_name, zipped_data in [
        ("VAL", zip(raw_val_sentences, raw_val_sentences_error)),
        ("TEST", zip(raw_test_sentences, raw_test_sentences_error)),
    ]:
        filtered_data = [(r, h) for r, h in zipped_data if len(r.split()) == len(h.split())]
        ref, hyp = (list(t) for t in zip(*filtered_data)) if filtered_data else ([], [])
        if ref and hyp:  # Ensure ref and hyp are not empty
            # baseline error
            base_wer = wer(ref, hyp)
            base_cer = cer(ref, hyp)

            # corrected
            corr = [sc.correct(s) for s in hyp]
            corr_wer = wer(ref, corr)
            corr_cer = cer(ref, corr)

            print(
                f"{split_name} – "
                f"WER before {base_wer:.3f}, after {corr_wer:.3f} | "
                f"CER before {base_cer:.3f}, after {corr_cer:.3f}"
            )
        else:
            print(f"{split_name} – No valid data for evaluation.")



=== 2-Gram ===
VAL – WER before 0.998, after 0.980 | CER before 0.778, after 0.768
TEST – WER before 1.001, after 0.981 | CER before 0.779, after 0.769

=== 3-Gram ===
VAL – WER before 0.998, after 0.981 | CER before 0.778, after 0.769
TEST – WER before 1.001, after 0.982 | CER before 0.779, after 0.769


Using only Norwig

In [6]:
import pickle
import re
from collections import Counter
from jiwer import wer, cer
from spell_norvig import SpellCorrector
from nltk.tokenize import sent_tokenize

# 1) LOAD your data

with open("raw_val_sentences.pkl",   "rb") as f:
    val_clean = pickle.load(f)          # list[str]
with open("raw_val_sentences_error.pkl", "rb") as f:
    val_noisy = pickle.load(f)

with open("raw_test_sentences.pkl",   "rb") as f:
    test_clean = pickle.load(f)
with open("raw_test_sentences_error.pkl", "rb") as f:
    test_noisy = pickle.load(f)

# 2) BUILD Norvig‐style unigram frequencies
# (use the same clean data you trained your LM on)
# Here, we assume you have a raw text file of all training articles:
text = train_corpus.lower()
words = re.findall(r"[a-z]+", text)
freq_counts = Counter(words)

# 3) INSTANTIATE the Norvig corrector
sc_norvig = SpellCorrector(freq_counts=freq_counts, use_norvig=True)

# 4) DEFINE a safe evaluation helper
def evaluate_split(name, clean_sents, noisy_sents, corrector):
    # Align by position, drop any non‐strings
    pairs = [(r, h) for r, h in zip(clean_sents, noisy_sents)
             if isinstance(r, str) and isinstance(h, str) and r.strip() and h.strip()]
    if not pairs:
        print(f"{name}: No valid sentence pairs to evaluate.")
        return

    refs, hyps = zip(*pairs)
    refs = list(refs)
    hyps = list(hyps)

    # Baseline: noisy vs. clean
    base_wer = wer(refs, hyps)
    base_cer = cer(refs, hyps)

    # After correction
    corrected = [corrector.correct(s) for s in hyps]
    corr_wer = wer(refs, corrected)
    corr_cer = cer(refs, corrected)

    print(f"{name} – "
          f"WER before {base_wer:.3f}, after {corr_wer:.3f} | "
          f"CER before {base_cer:.3f}, after {corr_cer:.3f}")

# 5) RUN it on VAL and TEST
evaluate_split("VAL (Norvig)",  val_clean,  val_noisy,  sc_norvig)
evaluate_split("TEST (Norvig)", test_clean, test_noisy, sc_norvig)


VAL (Norvig) – WER before 1.329, after 1.181 | CER before 1.028, after 1.011
TEST (Norvig) – WER before 1.341, after 1.189 | CER before 1.036, after 1.019


The combined method (norwig + contexual)

In [9]:
import pickle
import re
from collections import Counter
from jiwer import wer, cer
from lm import SmoothedNGramLanguageModel
from spell_combine import SpellCorrector
from nltk.tokenize import sent_tokenize

# ——————————————————————
# 0) Load your VAL/TEST data
with open("raw_val_sentences.pkl",   "rb") as f:
    val_clean = pickle.load(f)
with open("raw_val_sentences_error.pkl", "rb") as f:
    val_noisy = pickle.load(f)

with open("raw_test_sentences.pkl",   "rb") as f:
    test_clean = pickle.load(f)
with open("raw_test_sentences_error.pkl", "rb") as f:
    test_noisy = pickle.load(f)

# ——————————————————————
# 1) Build Norvig-style unigram frequencies (the same clean corpus you use for LM)

words = re.findall(r"[a-z]+", train_corpus)
freq_counts = Counter(words)

# ——————————————————————
# 2) Helper to evaluate any corrector
def evaluate(name, clean_sents, noisy_sents, corrector):
    pairs = [(r, h) for r, h in zip(clean_sents, noisy_sents) if r and h]
    refs, hyps = zip(*pairs)
    refs, hyps = list(refs), list(hyps)

    base_wer = wer(refs, hyps)
    base_cer = cer(refs, hyps)

    corrected = [corrector.correct(s) for s in hyps]
    corr_wer = wer(refs, corrected)
    corr_cer = cer(refs, corrected)

    print(f"{name} – "
          f"WER before {base_wer:.3f}, after {corr_wer:.3f} | "
          f"CER before {base_cer:.3f}, after {corr_cer:.3f}")

# ——————————————————————
# 3) Loop over n = 2 and 3
for n in [2, 3]:
    print(f"\n=== Hybrid {n}-Gram ===")
    
    # (a) Train your n-gram model
    lm = SmoothedNGramLanguageModel(n=n, k=0.1, threshold=1)
    lm.train(train_tokens_without_stemming)  # make sure this is your no-stem token list

    # (b) Instantiate the hybrid corrector
    sc_hybrid = SpellCorrector(
        lm=lm,
        freq_counts=freq_counts,
        hybrid=True,
        max_edit_dist=2,        
        candidate_limit=100,    1
        length_diff_limit=2
    )

    # (c) Evaluate on VAL and TEST
    evaluate(f"VAL hybrid {n}-gram",  val_clean,  val_noisy,  sc_hybrid)
    evaluate(f"TEST hybrid {n}-gram", test_clean, test_noisy, sc_hybrid)



=== Hybrid 2-Gram ===
VAL hybrid 2-gram – WER before 1.329, after 1.181 | CER before 1.028, after 1.012
TEST hybrid 2-gram – WER before 1.341, after 1.189 | CER before 1.036, after 1.019

=== Hybrid 3-Gram ===
VAL hybrid 3-gram – WER before 1.329, after 1.178 | CER before 1.028, after 1.013
TEST hybrid 3-gram – WER before 1.341, after 1.185 | CER before 1.036, after 1.021
