<a href="https://colab.research.google.com/github/ahmedsaalman/low-resource-rag-comparison/blob/main/Generator_Model_Dependencies_mBart_Outputs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Installing dependencies... (This takes ~1 minute)")
!pip install -q transformers datasets evaluate sentencepiece accelerate sacrebleu rouge_score nltk

import os
import torch
import json
import pandas as pd
import numpy as np
import evaluate
from datasets import Dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)

# Setup device
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print(f"   GPU: {torch.cuda.get_device_name(0)}")

Installing dependencies... (This takes ~1 minute)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
   GPU: Tesla T4


In [2]:
from google.colab import files
uploaded = files.upload()

Saving eval_queries.jsonl to eval_queries.jsonl
Saving hard_negatives.jsonl to hard_negatives.jsonl
Saving synthetic_qa_pairs.jsonl to synthetic_qa_pairs.jsonl
Saving urdu_covid_corpus.jsonl to urdu_covid_corpus.jsonl
Saving urdu_covid_corpus_clean.jsonl to urdu_covid_corpus_clean.jsonl
Saving urdu_covid_passages.tsv to urdu_covid_passages.tsv
Saving urdu_covid_passages_min.jsonl to urdu_covid_passages_min.jsonl


In [3]:
import re

FILES = {
    "corpus": "urdu_covid_corpus_clean.jsonl",
    "synthetic": "synthetic_qa_pairs.jsonl",
    "eval": "eval_queries.jsonl"
}

def clean_wiki_text(text):
    if not text: return ""

    text = re.sub(r'\(\s*انگریزی\s*:.*?\)', '', text)

    text = re.sub(r'\/.*\/', '', text)

    text = re.sub(r'\[.*?\]', '', text)

    text = re.sub(r'\s+', ' ', text).strip()

    return text

def load_jsonl(filename):
    data = []
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                data.append(json.loads(line))
    return data

print(f"Loading and Cleaning Corpus from {FILES['corpus']}...")
corpus_data = load_jsonl(FILES['corpus'])

corpus_lookup = {}
for item in corpus_data:
    cleaned_text = clean_wiki_text(item.get('text', ''))
    if len(cleaned_text) > 20: # Skip empty/too short lines
        corpus_lookup[item['id']] = cleaned_text

print(f"   ✅ Corpus loaded. {len(corpus_lookup)} clean passages ready.")

print(f"Loading Synthetic Data...")
synthetic_data = load_jsonl(FILES['synthetic'])
training_pairs = []

for item in synthetic_data:
    p_id = item.get('positive_id') or (item.get('positive_ids')[0] if item.get('positive_ids') else None)

    if p_id and p_id in corpus_lookup:
        training_pairs.append({
            "question": item['query'],
            "answer": corpus_lookup[p_id]
        })

print(f"   ✅ Mapped {len(training_pairs)} Primary QA pairs.")

eval_raw = load_jsonl(FILES['eval'])
eval_pairs = [{"question": i['query'], "answer": i['gold_answer']} for i in eval_raw]
df_eval = pd.DataFrame(eval_pairs)

Loading and Cleaning Corpus from urdu_covid_corpus_clean.jsonl...
   ✅ Corpus loaded. 60 clean passages ready.
Loading Synthetic Data...
   ✅ Mapped 500 Primary QA pairs.


In [5]:
import random

print("Performing Smart Data Augmentation...")

templates = [
    "{title} کیا ہے؟",                         # What is {title}?
    "{title} کے بارے میں معلومات",             # Information about {title}
    "{title} کی تفصیل بیان کریں",              # Describe {title}
    "{title} سے کیا مراد ہے؟",                 # What is meant by {title}?
    "براہ کرم {title} کے بارے میں بتائیں"      # Please tell me about {title}
]

augmented_samples = []
target_count = 600

shuffled_ids = list(corpus_lookup.keys())
random.shuffle(shuffled_ids)

for pid in shuffled_ids:
    if len(augmented_samples) >= target_count: break

    meta = next((item for item in corpus_data if item["id"] == pid), None)
    text = corpus_lookup[pid]

    if meta and meta.get('title'):
        title = meta['title']

        if len(title) > 3:
            tmpl = random.choice(templates)
            question = tmpl.format(title=title)

            augmented_samples.append({
                "question": question,
                "answer": text
            })

df_aug = pd.DataFrame(augmented_samples)
df_train_primary = pd.DataFrame(training_pairs)

df_total_train = pd.concat([df_train_primary, df_aug]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"📊 Final Training Set: {len(df_total_train)} samples")
print(f"   - {len(df_train_primary)} Real QA pairs")
print(f"   - {len(df_aug)} Augmented pairs")

train_dataset = Dataset.from_pandas(df_total_train)
eval_dataset = Dataset.from_pandas(df_eval)

Performing Smart Data Augmentation...
📊 Final Training Set: 560 samples
   - 500 Real QA pairs
   - 60 Augmented pairs


In [6]:
# Cell 4: Model Initialization
model_name = "facebook/mbart-large-50-many-to-many-mmt"

print(f"Loading {model_name}...")
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = "ur_PK"
tokenizer.tgt_lang = "ur_PK"

model = MBartForConditionalGeneration.from_pretrained(model_name)
model.config.forced_bos_token_id = tokenizer.lang_code_to_id["ur_PK"]

# MEMORY HACK: Enable Gradient Checkpointing
# This trades a little speed for MASSIVE memory savings
model.gradient_checkpointing_enable()

print("✅ Model loaded.")

Loading facebook/mbart-large-50-many-to-many-mmt...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

✅ Model loaded.


In [7]:
# Cell 5: Preprocessing & Config

max_input = 128
max_target = 256

def preprocess_fn(examples):
    inputs = [f"سوال: {q}" for q in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=max_input, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["answer"], max_length=max_target, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing data...")
tokenized_train = train_dataset.map(preprocess_fn, batched=True)
tokenized_eval = eval_dataset.map(preprocess_fn, batched=True)

args = Seq2SeqTrainingArguments(
    output_dir="./mbart-covid-urdu",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4, #
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
print("✅ Configuration ready.")

Tokenizing data...


Map:   0%|          | 0/560 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

✅ Configuration ready.


In [None]:
# Cell 6: Training Loop
torch.cuda.empty_cache()

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("🚀 Starting Training...")
trainer.train()
print("✅ Training finished.")

In [9]:
# Cell 7: Comprehensive Evaluation
import nltk
nltk.download('wordnet')
nltk.download('punkt')

metric_bleu = evaluate.load("sacrebleu")
metric_rouge = evaluate.load("rouge")
metric_meteor = evaluate.load("meteor")
metric_chrf = evaluate.load("chrf")

def evaluate_model():
    print("⏳ Generating predictions for Eval set... (This might take a minute)")

    results = trainer.predict(tokenized_eval)

    decoded_preds = tokenizer.batch_decode(results.predictions, skip_special_tokens=True)

    labels = np.where(results.label_ids != -100, results.label_ids, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]


    # A. BLEU (Requires list of lists for references)
    # Good for exact phrase matching
    bleu_refs = [[l] for l in decoded_labels]
    score_bleu = metric_bleu.compute(predictions=decoded_preds, references=bleu_refs)

    # B. ROUGE (Recall - Did we capture the main points?)
    # ROUGE-L is best for sentence-level structure
    score_rouge = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # C. METEOR (Semantic matching/Synonyms)
    score_meteor = metric_meteor.compute(predictions=decoded_preds, references=decoded_labels)

    # D. chrF (Character overlap - BEST for Urdu morphology)
    score_chrf = metric_chrf.compute(predictions=decoded_preds, references=decoded_labels)

    # --- DISPLAY RESULTS ---
    print("\n" + "="*40)
    print("📊 MODEL PERFORMANCE REPORT")
    print("="*40)
    print(f"🔹 BLEU Score:   {score_bleu['score']:.2f}  (Higher is better, >15 is decent for Urdu)")
    print(f"🔹 chrF Score:   {score_chrf['score']:.2f}  (Best metric for Urdu, aim for >40)")
    print(f"🔹 ROUGE-L:      {score_rouge['rougeL'] * 100:.2f}  (Sentence structure match)")
    print(f"🔹 METEOR:       {score_meteor['meteor'] * 100:.2f}  (Synonym/Meaning match)")
    print("="*40)

    print("\n--- 🔍 Qualitative Analysis (First 3 Samples) ---")
    for i in range(min(3, len(df_eval))):
        print(f" Question: {df_eval.iloc[i]['question']}")
        print(f" Gold Ans: {df_eval.iloc[i]['answer']}")
        print(f"Model Ans: {decoded_preds[i]}")
        print("-" * 50)

evaluate_model()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script: 0.00B [00:00, ?B/s]

⏳ Generating predictions for Eval set... (This might take a minute)


Epoch,Training Loss,Validation Loss
1,3.6981,3.405093
2,1.7719,3.39



📊 MODEL PERFORMANCE REPORT
🔹 BLEU Score:   5.42  (Higher is better, >15 is decent for Urdu)
🔹 chrF Score:   28.91  (Best metric for Urdu, aim for >40)
🔹 ROUGE-L:      3.05  (Sentence structure match)
🔹 METEOR:       23.50  (Synonym/Meaning match)

--- 🔍 Qualitative Analysis (First 3 Samples) ---
 Question: کووڈ-19 کی عام علامات کیا ہیں؟
 Gold Ans: عام علامات میں بخار، کھانسی اور سانس لینے میں دشواری شامل ہیں۔
Model Ans: کووڈ-19 کے عام علامات میں rhinitis، وائرس اور سانس لینے میں دشواری شامل ہیں؛ خاص طور پر سانس لینے میں دشواری۔
--------------------------------------------------
 Question: کووڈ-19 کی تشخیص کے لیے کون سا ٹیسٹ عام طور پر استعمال ہوتا ہے؟
 Gold Ans: تشخیص کے لیے عام طور پر rRT-PCR سویب ٹیسٹ استعمال ہوتے ہیں۔
Model Ans: کووڈ-19 تشخیص کے لیے عام طور پر ٹیسٹ عام طور پر استعمال ہوتے ہیں، اس لیے rRT-PCR سویب ٹیسٹ عام طور پر استعمال ہوتے ہیں۔
--------------------------------------------------
 Question: ہاتھوں کی صفائی وبا کے دوران کیوں ضروری ہے؟
 Gold Ans: صابن اور پانی سے کم 

In [10]:
# Cell 8: Interactive Test (Improved Generation Parameters)
import ipywidgets as widgets
from IPython.display import display
import torch

print("💬 Urdu COVID QA Interface: ")
model.eval()

def ask_mbart(question):
    input_str = f"سوال: {question}"
    inputs = tokenizer(input_str, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=150,
            min_length=10,
            num_beams=5,

            # --- CRITICAL FIXES FOR REPETITION ---
            repetition_penalty=1.5,
            no_repeat_ngram_size=2,

            # --- FIXES FOR CREATIVITY/LOGIC ---
            do_sample=True,           # Allows "temperature" to work
            temperature=0.6,          # Lower (0.6) = More factual/Focused. Higher (1.0) = Creative/Random
            top_p=0.9                 # Nucleus sampling (Keeps top 90% probable words)
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# UI Setup
txt_in = widgets.Text(placeholder='یہاں سوال لکھیں...', description='Question:', layout=widgets.Layout(width='80%'))
out_area = widgets.Output()

def on_change(change):
    with out_area:
        out_area.clear_output()
        if change.new:
            print(f"Thinking... (Model is analyzing '{change.new}')")
            ans = ask_mbart(change.new)
            print(f"\n💡 جواب:\n{ans}")

txt_in.observe(on_change, names='value')
display(txt_in, out_area)

💬 Urdu COVID QA Interface: 


Text(value='', description='Question:', layout=Layout(width='80%'), placeholder='یہاں سوال لکھیں...')

Output()

In [11]:
# Cell 9: Save Model
output_path = "./fine_tuned_mbart_urdu"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

print(f"Model saved to {output_path}")

!zip -r mbart_urdu_covid.zip {output_path}
from google.colab import files
try:
    files.download('mbart_urdu_covid.zip')
except:
    print("Download failed automatically. Please check the file browser on the left.")

Model saved to ./fine_tuned_mbart_urdu
  adding: fine_tuned_mbart_urdu/ (stored 0%)
  adding: fine_tuned_mbart_urdu/tokenizer.json (deflated 76%)
  adding: fine_tuned_mbart_urdu/special_tokens_map.json (deflated 61%)
  adding: fine_tuned_mbart_urdu/model.safetensors (deflated 7%)
  adding: fine_tuned_mbart_urdu/tokenizer_config.json (deflated 92%)
  adding: fine_tuned_mbart_urdu/generation_config.json (deflated 43%)
  adding: fine_tuned_mbart_urdu/config.json (deflated 60%)
  adding: fine_tuned_mbart_urdu/sentencepiece.bpe.model (deflated 49%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!cp mbart_urdu_covid.zip /content/drive/MyDrive/
