In [None]:
# ================================
# IMPORTS
# ================================
import os
import time
import datetime
import random
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from datasets import load_dataset, Dataset, Audio, DatasetDict, concatenate_datasets
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
)
from peft import get_peft_model, LoraConfig
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
import evaluate
from jiwer import wer as jiwer_wer
from huggingface_hub import login


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ================================
# CONFIGURATION
# ================================

# Experiment settings
EXPERIMENT_NAME = "finetuning-15A"
RANDOM_SEED = 42

# Model and LoRA config
BASE_MODEL_NAME = "openai/whisper-large-v3" # "openai/whisper-large-v3-turbo" # "openai/whisper-large-v3"
LORA_R = 16 #32
LORA_ALPHA = 32 #64
LORA_DROPOUT = 0.05
LORA_TARGET_MODULES = ["q_proj", "v_proj"] #, "k_proj", "out_proj"]

# Training config
LEARNING_RATE = 1e-4
BATCH_SIZE = 8
NUM_EPOCHS = 2 #12
FP16 = True
MAX_LABEL_LENGTH = 128

# Dataset config
TARGET_SR = 16000
AUDIO_COL = "audio"
TEXT_COL = "transcription"
TRAIN_NUM_SAMPLES = 4000  # None = full set
TEST_NUM_SAMPLES = None   # None = full set
EVAL_FROM_TRAIN_PCT = 0.05  # 0.05 = 5% validation from train

# Output files
PREDICTIONS_CSV = f"{EXPERIMENT_NAME}_predictions.csv"
SUMMARY_CSV = f"{EXPERIMENT_NAME}_summary.csv"

# Set random seeds
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)


In [None]:
# Pick from: "commonvoice", "fleurs", "csalt" or None
# At minimum, train_1 and test_1 must be non-None
train_1 = "csalt"
train_2 = "fleurs"
train_3 = None

test_1  = "commonvoice"
test_2  = None
test_3  = None


# helper functions

In [None]:
# ================================
# TEXT NORMALIZATION
# ================================
import re
import unicodedata
from typing import List, Callable
from itertools import product

# -----------------------------
# Core normalization utilities
# -----------------------------

_ARABIC_DIACRITICS = re.compile(
    "["                             # Arabic diacritics range
    "\u0610-\u061A"                 # honorifics, small high
    "\u064B-\u065F"                 # tanwin/harakat
    "\u0670"                        # superscript alef
    "\u06D6-\u06ED"                 # Quranic marks
    "]"
)

# Zero-width & elongation
_ZW_CHARS = re.compile("[\u200B-\u200F\u202A-\u202E\u2066-\u2069]")
_KASHIDA  = re.compile("\u0640")  # tatweel

# Arabic presentation forms (NFKC will canonicalize most)
def _compat_normalize(s: str) -> str:
    # Normalize compatibility forms and spacing
    s = unicodedata.normalize("NFKC", s)
    # Remove bidi/zero-width and kashida
    s = _ZW_CHARS.sub("", s)
    s = _KASHIDA.sub("", s)
    # Remove diacritics
    s = _ARABIC_DIACRITICS.sub("", s)
    return s

# Map Arabic/Urdu codepoints to a single canonical set often used in Urdu
# (Farsi Yeh, Heh goal, etc.)
def _canonical_codepoints(s: str) -> str:
    # Unify Yeh forms: U+064A (Arabic Yeh), U+06CC (Farsi Yeh) -> choose U+06CC
    s = s.replace("\u064A", "\u06CC")
    # Unify Alef Maksura (rare in Urdu) to Farsi Yeh as well (defensive)
    s = s.replace("\u0649", "\u06CC")
    # Unify Heh goal variants: ة/ه/ہ/ۂ → ہ (U+06C1) when appropriate
    # Keep it simple/robust for scoring:
    s = s.replace("\u06C0", "\u06C1")  # heh with hamza above → heh goal
    # Don't over-aggressively rewrite 'ه' to 'ہ' (Arabic heh to Urdu heh goal),
    # but we can do a light pass:
    s = re.sub(r"(?<=\S)\u0647(?=\b)", "\u06C1", s)  # word-final Arabic heh → Urdu heh goal
    return s

# Digits: normalize both Latin and Arabic-Indic to Arabic-Indic (or remove)
_ARABIC_INDIC_DIGITS = str.maketrans(
    "0123456789"
    "٠١٢٣٤٥٦٧٨٩"
    "۰۱۲۳۴۵۶۷۸۹",
    "۰۱۲۳۴۵۶۷۸۹" * 3  # map Latin + Arabic-Indic + Extended to Extended Arabic-Indic
)
def _normalize_digits(s: str) -> str:
    return s.translate(_ARABIC_INDIC_DIGITS)

# Remove punctuation & special markers (keep intra-word apostrophes if you want)
_PUNCT = re.compile(r"[^\w\s\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]")  # drop non-Arabic/word chars
# Seamless-style disfluencies: remove tokens like #um #uh #laugh
_SEAMLESS_DISFL = re.compile(r"(?<!\w)#\w+")

def _strip_punct_and_disfluencies(s: str) -> str:
    s = _SEAMLESS_DISFL.sub(" ", s)
    # Convert underscores/odd joins to space first (defensive)
    s = s.replace("_", " ")
    s = _PUNCT.sub(" ", s)
    return s

def _squash_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()


# -------------------------------------------
# Orthographic + token-segmentation variants
# -------------------------------------------

# Frequent variants noted in paper: "چاہیے" spellings; "ہو گا/ہوگا" etc.
_VARIANT_CANON = [
    # --- چاہیے (imperative/necessity) canonicalization ---
    # Variants: چاہیئے / چاھیے / چاہئے / چاہیۓ, etc → چاہیے
    (re.compile(r"\bچاہی[ئےۓ]\b"), "چاہیے"),
    (re.compile(r"\bچاھی[ئےۓ]\b"), "چاہیے"),
    (re.compile(r"\bچاہ[ئےۓ]\b"), "چاہیے"),
    # common stem "chahie" unvoweled variants
    (re.compile(r"\bچاہی?ے\b"), "چاہیے"),

    # --- ہوگا family: space-insensitive joining ---
    (re.compile(r"\bہو\s+گا\b"), "ہوگا"),
    (re.compile(r"\bہو\s+گی\b"), "ہوگی"),
    (re.compile(r"\bہو\s+گے\b"), "ہوگے"),
    # The reverse (split) hardly needed if we canonicalize to joined forms

    # Misc. common merges/splits seen in practice (add as you observe)
    (re.compile(r"\bکو ئی\b"), "کوئی"),
    (re.compile(r"\bکہ\b"), "کہ"),  # noop example; placeholders for future
]

def _apply_variant_canon(s: str) -> str:
    for pat, rep in _VARIANT_CANON:
        s = pat.sub(rep, s)
    return s


# -----------------------------
# Public normalizer
# -----------------------------
def normalize_urdu_text(text: str) -> str:
    """
    Robust normalizer for Urdu ASR scoring:
    - Unicode compatibility & diacritics removal
    - Canonical Urdu codepoints (Yeh/Heh goal)
    - Remove Seamless-style '#um' disfluencies
    - Remove punctuation
    - Normalize digits (Latin/Arabic to Eastern Arabic-Indic)
    - Canonicalize frequent orthographic variants (چاہیے, ہوگا~ہو گا)
    - Space squashing
    """
    if not text:
        return ""

    s = text

    # 1) Unicode & presentation forms → canonical, drop tatweel/ZW & diacritics
    s = _compat_normalize(s)

    # 2) Canonical Urdu codepoints
    s = _canonical_codepoints(s)

    # 3) Disfluencies + punctuation
    s = _strip_punct_and_disfluencies(s)

    # 4) Digits (optional; or drop all digits if your refs omit numbers)
    s = _normalize_digits(s)

    # 5) Orthographic canonicalizations & token segmentation fixes
    s = _apply_variant_canon(s)

    # 6) Collapse spaces
    s = _squash_spaces(s)

    return s


# ---------------------------------------------------------
# Optional: "lenient" comparison for WER with variants
# ---------------------------------------------------------

# Define lightweight variant generators for lattice expansion on very frequent cases.
# Keep these sets tight to avoid combinatorial blow-up.
_VARIANT_RULES = {
    "چاہیے": {"چاہیے", "چاہئے", "چاہیئے", "چاھیے", "چاہیۓ"},
    "ہوگا": {"ہوگا", "ہو گا"},
    "ہوگی": {"ہوگی", "ہو گی"},
    "ہوگے": {"ہوگے", "ہو گے"},
}

def _expand_variants(tokens: List[str]) -> List[List[str]]:
    expanded_per_token = []
    for tok in tokens:
        expanded_per_token.append(list(_VARIANT_RULES.get(tok, {tok})))
    # Cartesian product over tokens to build candidate sequences
    return [list(prod) for prod in product(*expanded_per_token)]

def generate_lenient_variants(s: str) -> List[str]:
    """
    Given a normalized string, produce a small set of alternative strings
    accounting for the most common spelling/spacing variants.
    """
    toks = s.split()
    seqs = _expand_variants(toks)
    return [" ".join(seq) for seq in seqs]

# Example of usage with jiwer:
# from jiwer import wer as jiwer_wer
# def lenient_min_wer(ref: str, hyp: str, normalizer: Callable[[str], str] = normalize_urdu_text) -> float:
#     r = normalizer(ref)
#     h = normalizer(hyp)
#     r_cands = generate_lenient_variants(r)
#     h_cands = generate_lenient_variants(h)
#     # Compute min WER across small lattice of variants
#     scores = []
#     for rc in r_cands:
#         for hc in h_cands:
#             scores.append(jiwer_wer(rc, hc))
#     return min(scores) if scores else jiwer_wer(r, h)


print("✅ Text normalization function loaded")


✅ Text normalization function loaded


In [None]:
# ================================
# SETUP
# ================================

overall_start_time = time.time()
print(f"🕐 Experiment started: {datetime.datetime.fromtimestamp(overall_start_time).strftime('%Y-%m-%d %H:%M:%S')}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")
if torch.cuda.is_available():
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
    print(f"✅ Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Login to HuggingFace
login(token="HF_TOKEN")


🕐 Experiment started: 2025-10-31 10:37:27
✅ Using device: cuda
✅ GPU: NVIDIA A40
✅ Available GPU memory: 48.31 GB


# data laoding

In [6]:
# ================================
# DATA LOADING HELPERS
# ================================

def ensure_audio_and_text(ds, text_keys=("transcription", "sentence", "text", "label")):
    """Standardize column names to 'audio' and 'transcription'"""
    # Ensure TEXT_COL
    if TEXT_COL not in ds.column_names:
        for k in text_keys:
            if k in ds.column_names:
                ds = ds.rename_column(k, TEXT_COL)
                break
    if TEXT_COL not in ds.column_names:
        raise ValueError("Could not find transcript column")

    # Ensure AUDIO_COL and cast
    if AUDIO_COL not in ds.column_names:
        cand = next((c for c in ds.column_names if c.lower() in ("audio", "path", "file")), None)
        if cand:
            ds = ds.rename_column(cand, AUDIO_COL)
    
    # Always cast audio to ensure consistent sampling rate and format
    ds = ds.cast_column(AUDIO_COL, Audio(sampling_rate=TARGET_SR, mono=True, decode=True))
    
    return ds

def subsample_after_shuffle(ds, n, seed=RANDOM_SEED):
    """Shuffle and subsample dataset"""
    if n is None or n <= 0 or n >= len(ds):
        return ds
    return ds.shuffle(seed=seed).select(range(n))

def load_csalt_raw():
    ds_all = load_dataset("urdu-asr/csalt-voice", token=False)
    train_like = ensure_audio_and_text(ds_all["validation"])
    return DatasetDict({"train": train_like})

def load_fleurs_raw():
    """Load FLEURS Urdu (ur_pk + ur_in + ur) and merge all splits"""
    all_langs = []
    for lang_code in ["ur_pk", "ur_in", "ur"]:
        try:
            dataset = load_dataset("google/fleurs", lang_code, trust_remote_code=True)
            all_langs.append(dataset)
            print(f"✅ Loaded FLEURS split for {lang_code} with splits: {list(dataset.keys())}")
        except Exception:
            print(f"⚠️ Could not load FLEURS language code: {lang_code}")
            continue

    if not all_langs:
        raise ValueError("Could not load any FLEURS Urdu variants")

    # Merge all language variants together
    from datasets import DatasetDict, concatenate_datasets

    merged = {}
    for split in ["train", "validation", "test"]:
        merged_splits = [
            ensure_audio_and_text(ds[split]) for ds in all_langs if split in ds
        ]
        if merged_splits:
            merged[split] = concatenate_datasets(merged_splits)

    print(f"✅ Combined FLEURS Urdu splits: {', '.join(merged.keys())}")
    return DatasetDict(merged)


def load_commonvoice_raw():
    ds_all = load_dataset("mozilla-foundation/common_voice_17_0", "ur", trust_remote_code=True, token=True)
    dd = {}
    for split in ["train", "validation", "test"]:
        if split in ds_all:
            ds = ds_all[split]
            if "sentence" in ds.column_names:
                ds = ds.rename_column("sentence", TEXT_COL)
            dd[split] = ensure_audio_and_text(ds)
    return DatasetDict(dd)


In [None]:
# ================================
# LOAD AND PREPARE DATASETS (DYNAMIC)
# ================================

print("\n" + "="*50)
print("📊 LOADING DATASETS (dynamic)")
print("="*50)

# 1) Load raw DatasetDicts (unchanged)
print("Loading CommonVoice...")
commonvoice = load_commonvoice_raw()

print("Loading FLEURS...")
fleurs = load_fleurs_raw()

print("Loading CSaLT...")
csalt = load_csalt_raw()

from datasets import concatenate_datasets

def merge_all_splits(ds_dict):
    """
    Concatenate all available splits from a DatasetDict.
    This mirrors your previous logic (train+validation+test).
    """
    available = [ds_dict[s] for s in ["train", "validation", "test"] if s in ds_dict]
    if not available:
        raise ValueError("No splits found to merge in provided DatasetDict.")
    return concatenate_datasets(available)

def safe_select_columns(ds, wanted_cols):
    """
    Select only the columns that actually exist to avoid KeyError
    if a source is missing one. (Typically both AUDIO_COL and TEXT_COL exist.)
    """
    keep = [c for c in wanted_cols if c in ds.column_names]
    if not keep:
        raise ValueError(
            f"None of the requested columns {wanted_cols} are present in {ds.column_names}"
        )
    return ds.select_columns(keep)

# 2) Build a prepared (merged + column-selected) registry for each dataset name
prepared_registry = {
    "commonvoice": safe_select_columns(merge_all_splits(commonvoice), [AUDIO_COL, TEXT_COL]),
    "fleurs":      safe_select_columns(merge_all_splits(fleurs),      [AUDIO_COL, TEXT_COL]),
    "csalt":       safe_select_columns(merge_all_splits(csalt),       [AUDIO_COL, TEXT_COL]),
}

# 3) Helpers to resolve user choices into a list of prepared datasets
def resolve_choice(name: str | None):
    if name is None:
        return None
    key = name.strip().lower()
    if key not in prepared_registry:
        valid = ", ".join(sorted(prepared_registry.keys()))
        raise ValueError(f"Unknown dataset '{name}'. Valid options: {valid} or None.")
    return prepared_registry[key]

def build_pool(*names):
    """
    Given up to three names/None, return a concatenated dataset
    of all non-None selections. Requires at least one non-None.
    """
    selected = [resolve_choice(n) for n in names if n is not None]
    if not selected:
        raise ValueError("At least one dataset must be selected to build a pool.")
    if len(selected) == 1:
        return selected[0]
    return concatenate_datasets(selected)

# 4) Resolve TRAIN and TEST pools from the six choices
#    (Shuffle + optional subsample mirrors your original behavior)
print("\n" + "-"*50)
print("🧩 Building TRAIN pool from user choices...")
train_pool = build_pool(train_1, train_2, train_3).shuffle(seed=RANDOM_SEED)

# Optional subsampling (disabled if TRAIN_NUM_SAMPLES=None)
train_ds = subsample_after_shuffle(train_pool, TRAIN_NUM_SAMPLES, seed=RANDOM_SEED)

# Optional: carve validation from train (unchanged)
validation_ds = None
if EVAL_FROM_TRAIN_PCT > 0.0:
    n_eval = int(len(train_ds) * EVAL_FROM_TRAIN_PCT)
    if n_eval > 0:
        validation_ds = train_ds.select(range(n_eval))
        train_ds = train_ds.select(range(n_eval, len(train_ds)))
        print(f"✅ Validation carved from train: {len(validation_ds)}")

print("\n" + "-"*50)
print("🧪 Building TEST pool from user choices...")
test_pool = build_pool(test_1, test_2, test_3).shuffle(seed=RANDOM_SEED)

# Optional subsampling for test (same helper you already have)
test_ds = subsample_after_shuffle(test_pool, TEST_NUM_SAMPLES, seed=RANDOM_SEED)

# 5) Summaries
def _fmt(x): return x if x is not None else "-"
print("\n" + "="*50)
print("✅ FINAL DATASET SIZES")
print("="*50)
print(f"Train set: {len(train_ds)} samples")
if validation_ds is not None:
    print(f"Validation set: {len(validation_ds)} samples")
print(f"Test set:  {len(test_ds)} samples")

print("\n" + "="*50)
print("📝 DATASET SOURCES (for this run)")
print("="*50)
print(f"train_1: {_fmt(train_1)} | train_2: {_fmt(train_2)} | train_3: {_fmt(train_3)}")
print(f"test_1:  {_fmt(test_1)}  | test_2:  {_fmt(test_2)}  | test_3:  {_fmt(test_3)}")



📊 LOADING DATASETS (dynamic)
Loading CommonVoice...


Using the latest cached version of the module from C:\Users\shaider\.cache\huggingface\modules\datasets_modules\datasets\mozilla-foundation--common_voice_17_0\9d10386a731ff6e6ed4ec973a4dc204a9820e8c842fbe388bdba0dd205ed5016 (last modified on Mon Oct 13 16:31:58 2025) since it couldn't be found locally at mozilla-foundation/common_voice_17_0, or remotely on the Hugging Face Hub.


Loading FLEURS...
✅ Loaded FLEURS split for ur_pk with splits: ['train', 'validation', 'test']
⚠️ Could not load FLEURS language code: ur_in
⚠️ Could not load FLEURS language code: ur
✅ Combined FLEURS Urdu splits: train, validation, test
Loading CSaLT...

--------------------------------------------------
🧩 Building TRAIN pool from user choices...
✅ Validation carved from train: 157

--------------------------------------------------
🧪 Building TEST pool from user choices...

✅ FINAL DATASET SIZES
Train set: 2989 samples
Validation set: 157 samples
Test set:  13481 samples

📝 DATASET SOURCES (for this run)
train_1: csalt | train_2: fleurs | train_3: -
test_1:  commonvoice  | test_2:  -  | test_3:  -


# model

In [8]:
# ================================
# MODEL SETUP
# ================================

print("\n" + "="*50)
print("🔧 MODEL SETUP")
print("="*50)

# Load processor
processor = WhisperProcessor.from_pretrained(BASE_MODEL_NAME)
tokenizer = processor.tokenizer
feature_extractor = processor.feature_extractor
tokenizer.pad_token = tokenizer.eos_token

print(f"✅ Loaded processor from {BASE_MODEL_NAME}")

# Load base model
print(f"Loading model in {'FP16' if FP16 else 'FP32'} precision...")
model = WhisperForConditionalGeneration.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.float16 if FP16 else torch.float32
)

# ✅ Force Urdu-only transcription mode (no English translation)
model.config.forced_decoder_ids = None
model.generation_config.forced_decoder_ids = None
model.config.language = "ur"
model.config.task = "transcribe"
model.generation_config.language = "ur"
model.generation_config.task = "transcribe"

print("✅ Configured model for Urdu transcription only (no English translation)")

# Apply LoRA
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    target_modules=LORA_TARGET_MODULES
)

model = get_peft_model(model, lora_config)
model.forward = model.base_model.forward

print("\n📊 Trainable Parameters:")
model.print_trainable_parameters()

model = model.to(device)



🔧 MODEL SETUP


`torch_dtype` is deprecated! Use `dtype` instead!


✅ Loaded processor from openai/whisper-large-v3
Loading model in FP16 precision...
✅ Configured model for Urdu transcription only (no English translation)

📊 Trainable Parameters:
trainable params: 7,864,320 || all params: 1,551,354,880 || trainable%: 0.5069


In [None]:
# ================================
# DATA PREPROCESSING
# ================================

print("\n" + "="*50)
print("🔄 PREPROCESSING DATA")
print("="*50)

def prepare_dataset(batch):
    """Preprocess audio and text for Whisper"""
    audio = batch[AUDIO_COL]
    
    # Process audio
    inputs = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    )
    batch["input_features"] = inputs.input_features[0]
    
    # Process text
    tokenized = tokenizer(
        batch[TEXT_COL],
        padding="max_length",
        max_length=MAX_LABEL_LENGTH,
        truncation=True,
        return_tensors="pt"
    )
    batch["labels"] = tokenized.input_ids[0]
    
    return batch

# Preprocess datasets
train_ds = train_ds.map(
    prepare_dataset,
    remove_columns=train_ds.column_names,
    desc="Preprocessing train set"
)

if validation_ds:
    validation_ds = validation_ds.map(
        prepare_dataset,
        remove_columns=validation_ds.column_names,
        desc="Preprocessing validation set"
    )

test_ds = test_ds.map(
    prepare_dataset,
    remove_columns=test_ds.column_names,
    desc="Preprocessing test set"
)

print(f"✅ Preprocessing complete")



🔄 PREPROCESSING DATA


Preprocessing train set: 100%|██████████| 2989/2989 [01:55<00:00, 25.87 examples/s]
Preprocessing validation set: 100%|██████████| 157/157 [00:06<00:00, 24.55 examples/s]
Preprocessing test set: 100%|██████████| 13481/13481 [11:36<00:00, 19.36 examples/s]  

✅ Preprocessing complete





In [None]:
# ================================
# PRE-TRAINING EVALUATION
# ================================

print("\n" + "="*50)
print("🔍 PRE-TRAINING WER EVALUATION")
print("="*50)

def evaluate_model(model, test_dataset, device, desc="Evaluating"):
    """Evaluate model and return WER metrics"""
    model.eval()
    
    predictions = []
    references = []
    predictions_raw = []  # Store raw predictions for debugging
    references_raw = []   # Store raw references for debugging
    
    with torch.no_grad():
        for sample in tqdm(test_dataset, desc=desc):
            input_features = torch.tensor(sample["input_features"]).unsqueeze(0).to(device)
            
            if FP16:
                input_features = input_features.half()
            
            pred_ids = model.generate(input_features=input_features)
            pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip()
            
            # Get reference from the preprocessed labels
            label_ids = sample["labels"]
            # Remove padding tokens
            label_ids = [id for id in label_ids if id != tokenizer.pad_token_id]
            label_str = tokenizer.decode(label_ids, skip_special_tokens=True).strip()
            
            # Store raw versions
            predictions_raw.append(pred_str)
            references_raw.append(label_str)
            
            # *** APPLY TEXT NORMALIZATION HERE ***
            pred_str_normalized = normalize_urdu_text(pred_str)
            label_str_normalized = normalize_urdu_text(label_str)
            
            predictions.append(pred_str_normalized)
            references.append(label_str_normalized)
    
    # Calculate WER on normalized text
    sample_wers = [jiwer_wer(ref, pred) for ref, pred in zip(references, predictions)]
    overall_wer = np.mean(sample_wers)
    
    return {
        "predictions": predictions,
        "references": references,
        "predictions_raw": predictions_raw,  # Include raw for debugging
        "references_raw": references_raw,
        "sample_wers": sample_wers,
        "overall_wer": overall_wer
    }


# Evaluate before fine-tuning
pre_results = evaluate_model(model, test_ds, device, desc="Pre-training evaluation")
pre_training_wer = pre_results["overall_wer"]

print(f"\n📊 PRE-TRAINING WER: {pre_training_wer:.4f} ({pre_training_wer*100:.2f}%)")

# Optional: Show some examples to verify normalization is working
print("\n🔍 Sample Normalization Examples:")
for i in range(min(3, len(pre_results["predictions"]))):
    print(f"\nExample {i+1}:")
    print(f"  Raw Reference:  {pre_results['references_raw'][i][:100]}")
    print(f"  Norm Reference: {pre_results['references'][i][:100]}")
    print(f"  Raw Prediction: {pre_results['predictions_raw'][i][:100]}")
    print(f"  Norm Prediction: {pre_results['predictions'][i][:100]}")



🔍 PRE-TRAINING WER EVALUATION


Pre-training evaluation:   0%|          | 0/13481 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Pre-training evaluation: 100%|██████████| 13481/13481 [7:54:18<00:00,  2.11s/it]  



📊 PRE-TRAINING WER: 0.3040 (30.40%)

🔍 Sample Normalization Examples:

Example 1:
  Raw Reference:  ضد کی ہے اور بات مگر خو بری نہیں
  Norm Reference: ضد کی ہے اور بات مگر خو بری نہیں
  Raw Prediction: زد کی ہے اور بات مگر خوبری نہیں
  Norm Prediction: زد کی ہے اور بات مگر خوبری نہیں

Example 2:
  Raw Reference:  نشست سے نکالا گیا
  Norm Reference: نشست سے نکالا گیا
  Raw Prediction: نشست سے نکالا گیا
  Norm Prediction: نشست سے نکالا گیا

Example 3:
  Raw Reference:  عامر لیاقت نامناسب تصویر پوسٹ کرنے پر تنازع کی زد میں
  Norm Reference: عامر لیاقت نامناسب تصویر پوسٹ کرنے پر تنازع کی زد میں
  Raw Prediction: آمد لیاقت نامناسب تصویر پوسٹ کرنے پر تنازع کی زد میں
  Norm Prediction: آمد لیاقت نامناسب تصویر پوسٹ کرنے پر تنازع کی زد میں


In [11]:
# ================================
# TRAINING SETUP
# ================================

print("\n" + "="*50)
print("🏋️ TRAINING SETUP")
print("="*50)

def collate_fn(batch):
    """Collate function for DataLoader"""
    input_feats = torch.stack([
        torch.tensor(item["input_features"], dtype=torch.float32)
        for item in batch
    ])
    
    label_tensors = pad_sequence(
        [torch.tensor(item["labels"], dtype=torch.long) for item in batch],
        batch_first=True,
        padding_value=tokenizer.pad_token_id
    )

    return {
        "input_features": input_feats,
        "labels": label_tensors
    }

# Create DataLoader
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=True,
    num_workers=0,
    collate_fn=collate_fn
)

# Setup optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Setup gradient scaler for FP16
scaler = torch.cuda.amp.GradScaler() if FP16 and torch.cuda.is_available() else None

print(f"✅ Optimizer: AdamW (lr={LEARNING_RATE})")
print(f"✅ Batch size: {BATCH_SIZE}")
print(f"✅ Total batches per epoch: {len(train_loader)}")
print(f"✅ Mixed precision (FP16): {FP16}")



🏋️ TRAINING SETUP
✅ Optimizer: AdamW (lr=0.0001)
✅ Batch size: 8
✅ Total batches per epoch: 374
✅ Mixed precision (FP16): True


  scaler = torch.cuda.amp.GradScaler() if FP16 and torch.cuda.is_available() else None


## training

In [None]:
# ================================
# TRAINING
# ================================

print("\n" + "="*50)
print("🚀 STARTING TRAINING")
print("="*50)

train_start_time = time.time()
model.train()

for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    print(f"\n🎯 Epoch {epoch+1}/{NUM_EPOCHS}")
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        try:
            input_feats = batch["input_features"].to(device)
            labels = batch["labels"].to(device)
            
            # Handle FP16 training
            if FP16 and scaler is not None:
                with torch.cuda.amp.autocast():
                    outputs = model(input_features=input_feats, labels=labels)
                    loss = outputs.loss
                
                optimizer.zero_grad()
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                if FP16:
                    input_feats = input_feats.half()
                
                outputs = model(input_features=input_feats, labels=labels)
                loss = outputs.loss
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            total_loss += loss.item()
            
        except Exception as e:
            print(f"⚠️ Error processing batch: {e}")
            continue
    
    avg_loss = total_loss / len(train_loader)
    print(f"✅ Epoch {epoch+1} complete — Avg Loss: {avg_loss:.4f}")
    
    # Validation if available
    if validation_ds:
        
        val_results = evaluate_model(model, validation_ds, device, desc="Validation")
        print(f"🔎 Validation WER: {val_results['overall_wer']:.4f}")
        model.train()  # Back to training mode

train_end_time = time.time()
train_duration_secs = int(train_end_time - train_start_time)
train_duration_hms = str(datetime.timedelta(seconds=train_duration_secs))

print(f"\n✅ Training complete! Duration: {train_duration_hms}")



🚀 STARTING TRAINING

🎯 Epoch 1/2


  with torch.cuda.amp.autocast():
Epoch 1: 100%|██████████| 374/374 [22:25<00:00,  3.60s/it]


✅ Epoch 1 complete — Avg Loss: 0.3214


Validation: 100%|██████████| 157/157 [15:32<00:00,  5.94s/it]


🔎 Validation WER: 0.2313

🎯 Epoch 2/2


Epoch 2: 100%|██████████| 374/374 [22:20<00:00,  3.58s/it]


✅ Epoch 2 complete — Avg Loss: 0.1652


Validation: 100%|██████████| 157/157 [15:22<00:00,  5.88s/it]

🔎 Validation WER: 0.2237

✅ Training complete! Duration: 1:15:41





## eval

In [None]:
# ================================
# POST-TRAINING EVALUATION
# ================================

print("\n" + "="*50)
print("📏 POST-TRAINING WER EVALUATION")
print("="*50)


# Evaluate after fine-tuning
post_results = evaluate_model(model, test_ds, device, desc="Post-training evaluation")
post_training_wer = post_results["overall_wer"]

print(f"\n📊 POST-TRAINING WER: {post_training_wer:.4f} ({post_training_wer*100:.2f}%)")

# Calculate improvement
wer_improvement = pre_training_wer - post_training_wer
wer_improvement_pct = (wer_improvement / pre_training_wer) * 100

print(f"\n🎉 WER IMPROVEMENT: {wer_improvement:.4f} ({wer_improvement_pct:.2f}%)")
print(f"   Pre-training:  {pre_training_wer:.4f}")
print(f"   Post-training: {post_training_wer:.4f}")



📏 POST-TRAINING WER EVALUATION


Post-training evaluation: 100%|██████████| 13481/13481 [8:09:51<00:00,  2.18s/it]  



📊 POST-TRAINING WER: 0.3018 (30.18%)

🎉 WER IMPROVEMENT: 0.0022 (0.72%)
   Pre-training:  0.3040
   Post-training: 0.3018


In [None]:
# ================================
# SAVE RESULTS
# ================================

print("\n" + "="*50)
print("💾 SAVING RESULTS")
print("="*50)

overall_end_time = time.time()
overall_duration_secs = int(overall_end_time - overall_start_time)
overall_duration_hms = str(datetime.timedelta(seconds=overall_duration_secs))

# Save sample-wise predictions with both raw and normalized versions
samplewise_data = []
for i in range(len(post_results["predictions"])):
    samplewise_data.append({
        "reference_raw": post_results["references_raw"][i],
        "reference_normalized": post_results["references"][i],
        "prediction_raw": post_results["predictions_raw"][i],
        "prediction_normalized": post_results["predictions"][i],
        "wer": round(post_results["sample_wers"][i], 4)
    })

CSV_OUTPUT_DIR = f"./experiments/{EXPERIMENT_NAME}"
# Create directories
os.makedirs(CSV_OUTPUT_DIR, exist_ok=True)
print(f"📁 Created directory: {CSV_OUTPUT_DIR}")

NEW_PREDICTIONS_CSV = f"{CSV_OUTPUT_DIR}/{PREDICTIONS_CSV}"
pd.DataFrame(samplewise_data).to_csv(NEW_PREDICTIONS_CSV, index=False)
print(f"📄 Saved predictions: {NEW_PREDICTIONS_CSV}")


# Save run summary
summary_data = {
    "experiment_name": EXPERIMENT_NAME,
    "base_model": BASE_MODEL_NAME,
    "lora_r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "lora_dropout": LORA_DROPOUT,
    "target_modules": str(LORA_TARGET_MODULES),
    "learning_rate": LEARNING_RATE,
    "batch_size": BATCH_SIZE,
    "num_train_epochs": NUM_EPOCHS,
    "train_1": train_1 if train_1 is not None else "-",
    "train_2": train_2 if train_2 is not None else "-",
    "train_3": train_3 if train_3 is not None else "-",
    "test_1":  test_1 if test_1 is not None else "-",
    "test_2":  test_2 if test_2 is not None else "-",
    "test_3":  test_3 if test_3 is not None else "-",
    "train_num_samples_cap": TRAIN_NUM_SAMPLES if TRAIN_NUM_SAMPLES else "full",
    "test_num_samples_cap": TEST_NUM_SAMPLES if TEST_NUM_SAMPLES else "full",
    "eval_from_train_pct": EVAL_FROM_TRAIN_PCT,
    "train_set_size": len(train_ds),
    "validation_set_size": len(validation_ds) if validation_ds else 0,
    "test_set_size": len(test_ds),
    "total_start_time": datetime.datetime.fromtimestamp(overall_start_time).strftime("%Y-%m-%d %H:%M:%S"),
    "total_end_time": datetime.datetime.fromtimestamp(overall_end_time).strftime("%Y-%m-%d %H:%M:%S"),
    "total_duration": overall_duration_hms,
    "train_start_time": datetime.datetime.fromtimestamp(train_start_time).strftime("%Y-%m-%d %H:%M:%S"),
    "train_end_time": datetime.datetime.fromtimestamp(train_end_time).strftime("%Y-%m-%d %H:%M:%S"),
    "train_duration": train_duration_hms,
    "fp16_enabled": FP16,
    "pre_training_wer": round(pre_training_wer, 4),
    "post_training_wer": round(post_training_wer, 4),
    "wer_improvement": round(wer_improvement, 4),
    "wer_improvement_percent": round(wer_improvement_pct, 2) if BASE_MODEL_NAME == "openai/whisper-large-v3-turbo" else '-',
    "wer_improvement_percent_large": round(wer_improvement_pct, 2) if BASE_MODEL_NAME == "openai/whisper-large-v3" else '-'
}

NEW_SUMMARY_CSV = f"{CSV_OUTPUT_DIR}/{SUMMARY_CSV}"
pd.DataFrame([summary_data]).to_csv(NEW_SUMMARY_CSV, index=False)
print(f"📄 Saved summary: {NEW_SUMMARY_CSV}")



💾 SAVING RESULTS
📁 Created directory: ./experiments/finetuning-15A
📄 Saved predictions: ./experiments/finetuning-15A/finetuning-15A_predictions.csv
📄 Saved summary: ./experiments/finetuning-15A/finetuning-15A_summary.csv


In [None]:
# ================================
# FINAL SUMMARY
# ================================

print("\n" + "="*50)
print("🎉 EXPERIMENT COMPLETE")
print("="*50)
print(f"Total duration: {overall_duration_hms}")
print(f"\n📊 Results:")
print(f"   Pre-training WER:  {pre_training_wer:.4f} ({pre_training_wer*100:.2f}%)")
print(f"   Post-training WER: {post_training_wer:.4f} ({post_training_wer*100:.2f}%)")
print(f"   Improvement:       {wer_improvement:.4f} ({wer_improvement_pct:.2f}%)")
print(f"\n📁 Output files:")
print(f"   - {PREDICTIONS_CSV}")
print(f"   - {SUMMARY_CSV}")
print("="*50)



🎉 EXPERIMENT COMPLETE
Total duration: 17:34:10

📊 Results:
   Pre-training WER:  0.3040 (30.40%)
   Post-training WER: 0.3018 (30.18%)
   Improvement:       0.0022 (0.72%)

📁 Output files:
   - finetuning-15A_predictions.csv
   - finetuning-15A_summary.csv


In [None]:
# ================================
# SAVE FINE-TUNED MODEL
# ================================

print("\n" + "="*50)
print("💾 SAVING FINE-TUNED MODEL")
print("="*50)

# Define output directory
OUTPUT_DIR = f"./saved_models/{EXPERIMENT_NAME}"
LORA_ADAPTER_DIR = f"{OUTPUT_DIR}/lora_adapter"
MERGED_MODEL_DIR = f"{OUTPUT_DIR}/merged_model"

# Create directories
os.makedirs(LORA_ADAPTER_DIR, exist_ok=True)
print(f"📁 Created directory: {LORA_ADAPTER_DIR}")

# 1. Save LoRA adapter weights (lightweight, recommended)
print("\n🔧 Saving LoRA adapter weights...")
model.save_pretrained(LORA_ADAPTER_DIR)
processor.save_pretrained(LORA_ADAPTER_DIR)
print(f"✅ LoRA adapter saved to: {LORA_ADAPTER_DIR}")

# 2. Save the merged model (optional, larger file size)
# Uncomment the following lines if you want to save the full merged model
"""
print("\n🔄 Merging LoRA weights with base model...")
os.makedirs(MERGED_MODEL_DIR, exist_ok=True)

# Merge and unload LoRA weights
merged_model = model.merge_and_unload()

# Save merged model
print("💾 Saving merged model...")
merged_model.save_pretrained(MERGED_MODEL_DIR)
processor.save_pretrained(MERGED_MODEL_DIR)
print(f"✅ Merged model saved to: {MERGED_MODEL_DIR}")
"""

# 3. Save configuration info
config_info = {
    "base_model": BASE_MODEL_NAME,
    "lora_r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "lora_dropout": LORA_DROPOUT,
    "target_modules": LORA_TARGET_MODULES,
    "training_epochs": NUM_EPOCHS,
    "learning_rate": LEARNING_RATE,
    "batch_size": BATCH_SIZE,
    "final_wer": round(post_training_wer, 4)
}

import json
with open(f"{LORA_ADAPTER_DIR}/training_config.json", "w") as f:
    json.dump(config_info, f, indent=2)
print(f"✅ Training config saved to: {LORA_ADAPTER_DIR}/training_config.json")

print("\n" + "="*50)
print("🎉 MODEL SAVING COMPLETE")
print("="*50)
print(f"\n📦 Saved files:")
print(f"   LoRA Adapter: {LORA_ADAPTER_DIR}")
print(f"   - adapter_model.safetensors (LoRA weights)")
print(f"   - adapter_config.json (LoRA configuration)")
print(f"   - preprocessor_config.json & tokenizer files")
print(f"   - training_config.json (your training settings)")

print(f"\n🔄 To load the model later, use:")
print(f"""
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from peft import PeftModel

# Load base model
base_model = WhisperForConditionalGeneration.from_pretrained("{BASE_MODEL_NAME}")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "{LORA_ADAPTER_DIR}")

# Load processor
processor = WhisperProcessor.from_pretrained("{LORA_ADAPTER_DIR}")
""")



💾 SAVING FINE-TUNED MODEL
📁 Created directory: ./saved_models/finetuning-15A/lora_adapter

🔧 Saving LoRA adapter weights...
✅ LoRA adapter saved to: ./saved_models/finetuning-15A/lora_adapter
✅ Training config saved to: ./saved_models/finetuning-15A/lora_adapter/training_config.json

🎉 MODEL SAVING COMPLETE

📦 Saved files:
   LoRA Adapter: ./saved_models/finetuning-15A/lora_adapter
   - adapter_model.safetensors (LoRA weights)
   - adapter_config.json (LoRA configuration)
   - preprocessor_config.json & tokenizer files
   - training_config.json (your training settings)

🔄 To load the model later, use:

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from peft import PeftModel

# Load base model
base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "./saved_models/finetuning-15A/lora_adapter")

# Load processor
processor = WhisperProcessor.from_pretrained("./s

# execute the next experiment
here we have prepared the next experiment and the code automatically runs after this experiment is completed to save time !

In [17]:
import papermill as pm

pm.execute_notebook(
    'finetuning-16A.ipynb',          # input notebook
    'finetuning-16A-run.ipynb',   # output notebook (with executed cells)
    kernel_name='python3',         # which kernel to use
    progress_bar=True,             # show progress bar
    log_output=True                # show cell outputs live
)


  from .autonotebook import tqdm as notebook_tqdm

Executing:  30%|███       | 7/23 [01:11<01:22,  5.18s/cell]Using the latest cached version of the module from C:\Users\shaider\.cache\huggingface\modules\datasets_modules\datasets\mozilla-foundation--common_voice_17_0\9d10386a731ff6e6ed4ec973a4dc204a9820e8c842fbe388bdba0dd205ed5016 (last modified on Mon Oct 13 16:31:58 2025) since it couldn't be found locally at mozilla-foundation/common_voice_17_0, or remotely on the Hugging Face Hub.

Executing:  39%|███▉      | 9/23 [01:31<01:37,  6.95s/cell]`torch_dtype` is deprecated! Use `dtype` instead!

Preprocessing train set:   0%|          | 0/3800 [00:00<?, ? examples/s]
Preprocessing train set:   0%|          | 1/3800 [00:04<4:24:41,  4.18s/ examples]
Preprocessing train set:   0%|          | 6/3800 [00:04<34:13,  1.85 examples/s]  
Preprocessing train set:   0%|          | 11/3800 [00:04<16:01,  3.94 examples/s]
Preprocessing train set:   0%|          | 16/3800 [00:04<09:35,  6.58 example

{'cells': [{'cell_type': 'code',
   'execution_count': 1,
   'id': '6597c63b',
   'metadata': {'tags': [],
    'papermill': {'exception': False,
     'start_time': '2025-10-31T23:11:47.007303',
     'end_time': '2025-10-31T23:12:50.119295',
     'duration': 63.111992,
     'status': 'completed'},
    'execution': {'iopub.status.busy': '2025-10-31T23:11:47.048304Z',
     'iopub.execute_input': '2025-10-31T23:11:47.048304Z',
     'iopub.status.idle': '2025-10-31T23:12:50.116243Z',
     'shell.execute_reply': '2025-10-31T23:12:50.116243Z'}},
   'outputs': [{'output_type': 'stream',
     'name': 'stderr',
  {'cell_type': 'code',
   'execution_count': 2,
   'id': '2ffcb618',
   'metadata': {'tags': [],
    'papermill': {'exception': False,
     'start_time': '2025-10-31T23:12:50.149298',
     'end_time': '2025-10-31T23:12:50.679421',
     'duration': 0.530123,
     'status': 'completed'},
    'execution': {'iopub.status.busy': '2025-10-31T23:12:50.177297Z',
     'iopub.execute_input': '2025