<a href="https://colab.research.google.com/github/ahmedsaalman/low-resource-rag-comparison/blob/main/Generator_Model_Dependencies_mBart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face stack and the metric backends imported further down.
# The leading `!` is an IPython/Colab shell escape; `-q` keeps pip's output quiet.
print("Installing dependencies... (This takes ~1 minute)")
!pip install -q transformers datasets evaluate sentencepiece accelerate sacrebleu rouge_score nltk

import os
import torch
import json
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)

# Select the compute device: CUDA when a GPU is visible, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
    # Report which accelerator this runtime was given.
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
else:
    device = "cpu"

In [None]:
# Colab-only helper: prompts for files to upload into the runtime.
from google.colab import files
# NOTE(review): presumably the three .jsonl files named in FILES are uploaded here — confirm.
uploaded = files.upload()

In [None]:
import re

# Input file names, keyed by the role each file plays in the pipeline.
FILES = dict(
    corpus="urdu_covid_corpus_clean.jsonl",
    synthetic="synthetic_qa_pairs.jsonl",
    eval="eval_queries.jsonl",
)

def clean_wiki_text(text):
    """Strip wiki-style annotations from a passage and normalise whitespace.

    Removes, in order: parenthesised notes matching the first pattern below,
    ``/.../`` slash-delimited spans, ``[...]`` bracketed markers, and then
    collapses every run of whitespace to a single space.

    Args:
        text: Raw passage text; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # NOTE(review): the literal inside this pattern looks mis-encoded
    # (UTF-8 read as Mac Roman) — confirm the file's encoding before relying on it.
    text = re.sub(r'\(\s*ÿßŸÜ⁄Øÿ±€åÿ≤€å\s*:.*?\)', '', text)

    # Match each slash pair on its own. The previous greedy `.*` removed
    # everything between the FIRST and the LAST slash of the passage, silently
    # deleting ordinary text that sat between two slash-delimited spans.
    text = re.sub(r'/[^/]*/', '', text)

    # Bracketed markers such as citation numbers.
    text = re.sub(r'\[.*?\]', '', text)

    # Collapse whitespace left behind by the removals.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def load_jsonl(filename):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        filename: Path of the ``.jsonl`` file (read as UTF-8).

    Returns:
        One parsed object per non-blank line, in file order. A missing file
        yields an empty list (with a printed warning) rather than an error.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(filename):
        # Keep the best-effort behaviour, but do not stay silent about it.
        print(f"   WARNING: {filename} not found; continuing with no records.")
        return []

    with open(filename, 'r', encoding='utf-8') as f:
        # Whitespace-only lines (e.g. a blank line at end of file) are skipped
        # instead of being handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Load the raw corpus and build an id -> cleaned passage text mapping.
print(f"Loading and Cleaning Corpus from {FILES['corpus']}...")
corpus_data = load_jsonl(FILES['corpus'])

corpus_lookup = {}
for item in corpus_data:
    cleaned_text = clean_wiki_text(item.get('text', ''))
    # Passages of 20 characters or fewer after cleaning are dropped.
    if len(cleaned_text) <= 20:
        continue
    corpus_lookup[item['id']] = cleaned_text

print(f"   ‚úÖ Corpus loaded. {len(corpus_lookup)} clean passages ready.")

# Turn each synthetic query into a (question, answer) pair, where the answer
# is the cleaned text of the passage the query points at.
print("Loading Synthetic Data...")
synthetic_data = load_jsonl(FILES['synthetic'])
training_pairs = []

for item in synthetic_data:
    # Prefer 'positive_id'; otherwise fall back to the first of 'positive_ids'.
    p_id = item.get('positive_id')
    if not p_id:
        candidate_ids = item.get('positive_ids')
        p_id = candidate_ids[0] if candidate_ids else None

    # Queries whose passage is unknown (or was filtered out) are skipped.
    if not p_id or p_id not in corpus_lookup:
        continue

    training_pairs.append({"question": item['query'], "answer": corpus_lookup[p_id]})

print(f"   ‚úÖ Mapped {len(training_pairs)} Primary QA pairs.")

# Evaluation set: each record's 'query' / 'gold_answer' become question / answer.
eval_raw = load_jsonl(FILES['eval'])
eval_pairs = [
    {"question": record['query'], "answer": record['gold_answer']}
    for record in eval_raw
]
df_eval = pd.DataFrame(eval_pairs)

In [None]:
import random

print("Performing Smart Data Augmentation...")

# Question templates; each is filled with a passage title via str.format.
templates = [
    "{title} ⁄©€åÿß €Å€íÿü",                         # What is {title}?
    "{title} ⁄©€í ÿ®ÿßÿ±€í ŸÖ€å⁄∫ ŸÖÿπŸÑŸàŸÖÿßÿ™",             # Information about {title}
    "{title} ⁄©€å ÿ™ŸÅÿµ€åŸÑ ÿ®€åÿßŸÜ ⁄©ÿ±€å⁄∫",              # Describe {title}
    "{title} ÿ≥€í ⁄©€åÿß ŸÖÿ±ÿßÿØ €Å€íÿü",                 # What is meant by {title}?
    "ÿ®ÿ±ÿß€Å ⁄©ÿ±ŸÖ {title} ⁄©€í ÿ®ÿßÿ±€í ŸÖ€å⁄∫ ÿ®ÿ™ÿßÿ¶€å⁄∫"      # Please tell me about {title}
]

augmented_samples = []
target_count = 600  # Upper bound on the number of template-generated pairs.

# Index titles by passage id once, instead of scanning corpus_data for every
# passage (which was quadratic in the corpus size). setdefault keeps the FIRST
# record for a duplicated id, matching the previous first-match lookup.
title_by_id = {}
for item in corpus_data:
    if "id" in item:
        title_by_id.setdefault(item["id"], item.get('title'))

# NOTE: the shuffle and template choice are unseeded, so the augmented set
# differs from run to run.
shuffled_ids = list(corpus_lookup)
random.shuffle(shuffled_ids)

for pid in shuffled_ids:
    if len(augmented_samples) >= target_count:
        break

    title = title_by_id.get(pid)
    # Skip passages without a title, or with a title of 3 characters or fewer.
    if not title or len(title) <= 3:
        continue

    tmpl = random.choice(templates)
    augmented_samples.append({
        "question": tmpl.format(title=title),
        "answer": corpus_lookup[pid],
    })

df_aug = pd.DataFrame(augmented_samples)
df_train_primary = pd.DataFrame(training_pairs)

# Merge real and augmented pairs, then shuffle rows with a fixed seed.
df_total_train = pd.concat([df_train_primary, df_aug]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"üìä Final Training Set: {len(df_total_train)} samples")
print(f"   - {len(df_train_primary)} Real QA pairs")
print(f"   - {len(df_aug)} Augmented pairs")

# (A stray leading space previously made the next line an IndentationError.)
train_dataset = Dataset.from_pandas(df_total_train)
eval_dataset = Dataset.from_pandas(df_eval)

In [None]:
# Cell 4: Model Initialization
model_name = "facebook/mbart-large-50-many-to-many-mmt"

print(f"Loading {model_name}...")
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
# Source and target language codes are both set to Urdu.
tokenizer.src_lang = "ur_PK"
tokenizer.tgt_lang = "ur_PK"

model = MBartForConditionalGeneration.from_pretrained(model_name)
# Force the first generated token to be the Urdu language code. It is set on
# the model config (as before) AND on the generation config, because
# generate() takes its defaults from model.generation_config.
# NOTE(review): whether the config-only setting is ignored depends on the
# installed transformers version and the checkpoint files — confirm.
urdu_bos_id = tokenizer.lang_code_to_id["ur_PK"]
model.config.forced_bos_token_id = urdu_bos_id
model.generation_config.forced_bos_token_id = urdu_bos_id

# MEMORY HACK: Enable Gradient Checkpointing
# This trades a little speed for MASSIVE memory savings
model.gradient_checkpointing_enable()

print("‚úÖ Model loaded.")

In [None]:
# Cell 5: Preprocessing & Config

max_input = 128    # Token cap applied to each prefixed question.
max_target = 256   # Token cap applied to each answer.

def preprocess_fn(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: A batch mapping with "question" and "answer" lists, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized questions with an added "labels" entry holding the
        token ids of the answers.
    """
    # Every question is prefixed with the same marker before tokenization.
    inputs = [f"ÿ≥ŸàÿßŸÑ: {q}" for q in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=max_input, truncation=True)

    # `text_target=` tokenizes in target mode; it replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=examples["answer"], max_length=max_target, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing data...")
tokenized_train = train_dataset.map(preprocess_fn, batched=True)
tokenized_eval = eval_dataset.map(preprocess_fn, batched=True)

# (A stray leading space previously made the next statement an IndentationError.)
args = Seq2SeqTrainingArguments(
    output_dir="./mbart-covid-urdu",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4, # Effective batch = 16
    num_train_epochs=8,            # Increased epochs for small data
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    # Mixed precision only when a GPU is present; a hard-coded True makes the
    # notebook fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
print("‚úÖ Configuration ready.")

In [None]:
# Cell 6: Training Loop
torch.cuda.empty_cache()

# Early stopping with a patience of 3.
stop_early = EarlyStoppingCallback(early_stopping_patience=3)

# Gather the trainer's inputs in one place, then construct it.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[stop_early],
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

print("üöÄ Starting Training...")
trainer.train()
print("‚úÖ Training finished.")

In [None]:
# Cell 7: Comprehensive Evaluation
import nltk
# NLTK data downloads; presumably needed by the METEOR metric loaded below — confirm.
nltk.download('wordnet')
nltk.download('punkt')

# Metric objects from the `evaluate` library; evaluate_model() calls .compute() on each.
metric_bleu = evaluate.load("sacrebleu")
metric_rouge = evaluate.load("rouge")
metric_meteor = evaluate.load("meteor")
metric_chrf = evaluate.load("chrf")

def evaluate_model():
    """Generate answers for the eval set, score them, and print a report.

    Reads the module-level ``trainer``, ``tokenizer``, ``tokenized_eval``,
    ``df_eval`` and the four ``metric_*`` objects. Prints BLEU, chrF, ROUGE-L
    and METEOR, followed by up to three question / gold / prediction triples.
    Returns nothing.
    """
    print("‚è≥ Generating predictions for Eval set... (This might take a minute)")

    results = trainer.predict(tokenized_eval)

    # Predictions may arrive as a tuple, and batches of different length can be
    # padded with -100 when concatenated; -100 is not a valid token id, so it
    # is swapped for the pad token before decoding (same treatment as labels).
    preds = results.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(results.label_ids != -100, results.label_ids, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # A. BLEU (Requires list of lists for references)
    # Good for exact phrase matching
    bleu_refs = [[l] for l in decoded_labels]
    score_bleu = metric_bleu.compute(predictions=decoded_preds, references=bleu_refs)

    # B. ROUGE (Recall - Did we capture the main points?)
    # ROUGE-L is best for sentence-level structure
    # The metric's default tokenizer keeps only ASCII letters and digits, which
    # reduces Urdu text to nothing and yields a score of 0; split on
    # whitespace instead.
    score_rouge = metric_rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.split(),
    )

    # C. METEOR (Semantic matching/Synonyms)
    score_meteor = metric_meteor.compute(predictions=decoded_preds, references=decoded_labels)

    # D. chrF (Character overlap - BEST for Urdu morphology)
    score_chrf = metric_chrf.compute(predictions=decoded_preds, references=decoded_labels)

    # --- DISPLAY RESULTS ---
    print("\n" + "="*40)
    print("üìä MODEL PERFORMANCE REPORT")
    print("="*40)
    print(f"üîπ BLEU Score:   {score_bleu['score']:.2f}  (Higher is better, >15 is decent for Urdu)")
    print(f"üîπ chrF Score:   {score_chrf['score']:.2f}  (Best metric for Urdu, aim for >40)")
    print(f"üîπ ROUGE-L:      {score_rouge['rougeL'] * 100:.2f}  (Sentence structure match)")
    print(f"üîπ METEOR:       {score_meteor['meteor'] * 100:.2f}  (Synonym/Meaning match)")
    print("="*40)

    print("\n--- üîç Qualitative Analysis (First 3 Samples) ---")
    for i in range(min(3, len(df_eval))):
        print(f" Question: {df_eval.iloc[i]['question']}")
        print(f" Gold Ans: {df_eval.iloc[i]['answer']}")
        print(f"Model Ans: {decoded_preds[i]}")
        print("-" * 50)

evaluate_model()

In [None]:
# Cell 8: Interactive Test (Improved Generation Parameters)
import ipywidgets as widgets
from IPython.display import display
import torch

print("üí¨ Urdu COVID QA Interface: ")
model.eval()

def ask_mbart(question):
    """Generate an answer for a single question with the fine-tuned model.

    Args:
        question: The question text; it is prefixed the same way as in
            preprocessing before being tokenized.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    input_str = f"ÿ≥ŸàÿßŸÑ: {question}"
    # Send the inputs to wherever the model actually lives, so the two cannot
    # end up on different devices.
    inputs = tokenizer(input_str, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Passed explicitly so the Urdu start token does not depend on
            # config state set in another cell.
            forced_bos_token_id=tokenizer.lang_code_to_id["ur_PK"],
            max_length=150,
            min_length=10,
            num_beams=5,

            # --- CRITICAL FIXES FOR REPETITION ---
            repetition_penalty=1.5,
            no_repeat_ngram_size=2,

            # --- FIXES FOR CREATIVITY/LOGIC ---
            do_sample=True,           # Allows "temperature" to work
            temperature=0.6,          # Lower (0.6) = More factual/Focused. Higher (1.0) = Creative/Random
            top_p=0.9                 # Nucleus sampling (Keeps top 90% probable words)
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# UI Setup
# continuous_update=False makes the value change fire when the text is
# submitted (Enter / focus loss) instead of on every keystroke; otherwise each
# typed character would trigger a full beam-search generation.
txt_in = widgets.Text(
    placeholder='€å€Åÿß⁄∫ ÿ≥ŸàÿßŸÑ ŸÑ⁄©⁄æ€å⁄∫...',
    description='Question:',
    layout=widgets.Layout(width='80%'),
    continuous_update=False,
)
out_area = widgets.Output()

def on_change(change):
    """Answer the question in the text box and show it in the output area.

    Args:
        change: The change notification from the widget; ``change.new`` holds
            the new text. Empty text only clears the output.
    """
    with out_area:
        out_area.clear_output()
        if change.new:
            print(f"Thinking... (Model is analyzing '{change.new}')")
            ans = ask_mbart(change.new)
            print(f"\nüí° ÿ¨Ÿàÿßÿ®:\n{ans}")

txt_in.observe(on_change, names='value')
display(txt_in, out_area)

In [None]:
# Cell 9: Save Model
output_path = "./fine_tuned_mbart_urdu"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

print(f"Model saved to {output_path}")

 !zip -r mbart_urdu_covid.zip {output_path}
from google.colab import files
try:
    files.download('mbart_urdu_covid.zip')
except:
    print("Download failed automatically. Please check the file browser on the left.")

In [None]:
# Colab-only: mount Google Drive and copy the zipped model archive into it.
from google.colab import drive
drive.mount('/content/drive')
# Shell escape; mbart_urdu_covid.zip must already exist in the working directory.
!cp mbart_urdu_covid.zip /content/drive/MyDrive/


In [None]:
# GENERATION ARGUMENTS (Fixes the "rhinitis" loop)
# GenerationConfig was never imported anywhere in this notebook, so this cell
# raised NameError; import it here.
from transformers import GenerationConfig

# NOTE(review): nothing visible in this notebook passes this object to
# model.generate(generation_config=...) — confirm where it is meant to be used.
generation_config = GenerationConfig(
    max_new_tokens=128,
    repetition_penalty=1.2,   # Penalizes repeating words
    no_repeat_ngram_size=3,   # Prevents 3-word phrase repeats
)