# Advanced Bengali Regional ASR - Full Fine-tuning
### FIXED Version: No Trimming + TRUE Per-Epoch Augmentation + Weighted Sampling

In [None]:
!apt-get install -y libsndfile1
!pip install numpy scipy librosa soundfile
!pip install audiomentations --no-build-isolation

In [None]:
!pip install transformers==4.47.0 datasets accelerate evaluate jiwer tensorboard librosa soundfile audiomentations -q

In [None]:
!pip install evaluate

## Setup and Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Fix protobuf compatibility issues by selecting the pure-Python protobuf
# implementation. Set before the heavy imports below so it is in place when
# protobuf is first loaded.
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

# Quiet TensorFlow's C++ logging. TF_CPP_MIN_LOG_LEVEL is a TensorFlow
# setting; NOTE(review): it is not expected to silence PyTorch/CUDA messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
!pip uninstall -y protobuf
!pip install protobuf==3.20.3

In [None]:
import os
import pandas as pd
import numpy as np
import random
import torch
import librosa
from torch.utils.data import Dataset as TorchDataset
from datasets import Dataset, DatasetDict
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Audio augmentation (optional dependency). USE_AUDIOMENTATIONS records whether
# the import succeeded so the augmentation cell can choose between the
# audiomentations pipeline and its numpy fallback.
try:
    from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Gain
    USE_AUDIOMENTATIONS = True
    print("✓ audiomentations library loaded")
except ImportError:
    USE_AUDIOMENTATIONS = False
    print("⚠ audiomentations not available, using basic augmentation")

# Report the runtime so the PyTorch version and GPU show up in the cell output.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Configuration

In [None]:
# Model and paths
MODEL_NAME = "bengaliAI/tugstugi_bengaliai-asr_whisper-medium"
TRAIN_AUDIO_PATH = "/kaggle/input/shobdotori-regspeech12-compact-v2/shobdotori_regspeech12_compact_v2/Train"
TRAIN_ANNOTATION_PATH = "/kaggle/input/shobdotori-regspeech12-compact-v2/shobdotori_regspeech12_compact_v2/Train_annotation"
OUTPUT_DIR = "./whisper-bengali-full-finetune-advanced"

# Training configuration (optimized for P100)
# NOTE: several keys below are not consumed by the visible cells; they are
# flagged individually so nobody assumes that changing them has an effect.
SEED = 42
CONFIG = {
    'batch_size': 1,              # per-device batch size, used for both train and eval
    'gradient_accumulation': 4,   # effective batch size = batch_size * gradient_accumulation
    'learning_rate': 1e-5,
    'weight_decay': 0.01,
    'warmup_steps': 500,
    'num_epochs': 6,
    'eval_steps': 500,            # only referenced by a commented-out training argument
    'save_steps': None,           # not referenced by the visible cells
    'save_total_limit': None,     # not read; the training arguments pass a literal None
    'early_stopping_patience': 3, # only printed; the trainer is built with callbacks=[]
    'use_augmentation': True,     # global switch checked inside the dataset's __getitem__
    'augmentation_prob': 0.5,     # chance that a training clip is augmented at all
    'use_weighted_sampling': True,
    'fp16': True,
}

# Set random seeds (Python, NumPy, and PyTorch CPU/GPU generators)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

## Mild Augmentation Setup

In [None]:
# Augmentation pipeline: numpy-only fallback when audiomentations is missing,
# otherwise the library-backed chain. Both expose the same apply_augmentation().
if not USE_AUDIOMENTATIONS:
    def apply_augmentation(audio, sample_rate, probability=0.5):
        """Randomly perturb ``audio`` with plain numpy operations.

        With chance ``probability`` the clip is augmented: Gaussian noise
        (sigma 0.005) is added half of the time and a gain in [0.7, 1.3] is
        applied 30% of the time, then the result is clipped to [-1, 1].
        Otherwise the input is returned untouched. ``sample_rate`` is unused
        here and kept for signature parity with the library variant.
        """
        skip = np.random.random() >= probability
        if skip:
            return audio

        # Additive Gaussian noise
        add_noise = np.random.random() < 0.5
        if add_noise:
            audio = audio + np.random.normal(0, 0.005, audio.shape)

        # Volume scaling
        rescale = np.random.random() < 0.3
        if rescale:
            audio = audio * np.random.uniform(0.7, 1.3)

        return np.clip(audio, -1.0, 1.0)
else:
    # Library-backed chain: each transform fires independently with its own p.
    augment = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.01, p=0.5),
        TimeStretch(min_rate=0.9, max_rate=1.1, p=0.5),
        PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
        Gain(min_gain_db=-6, max_gain_db=6, p=0.3),
    ])

    def apply_augmentation(audio, sample_rate, probability=0.5):
        """Run ``audio`` through the audiomentations chain with chance ``probability``.

        Returns the augmented samples, or the input unchanged when the random
        draw says to skip augmentation for this call.
        """
        should_augment = np.random.random() < probability
        return augment(samples=audio, sample_rate=sample_rate) if should_augment else audio

print("✓ Augmentation pipeline initialized")
print("  This will be applied ON-THE-FLY during training (fresh augmentations each epoch)")

## Load Regional Data

In [None]:
# Get all regional folders (one sub-directory per dialect).
# Sorted because os.listdir returns entries in arbitrary order, and the region
# order fixes the row order of the dataset table -- and therefore which rows
# the seeded train/val split selects.
regions = sorted(
    d for d in os.listdir(TRAIN_AUDIO_PATH)
    if os.path.isdir(os.path.join(TRAIN_AUDIO_PATH, d))
)
print(f"Found {len(regions)} regional dialects: {regions}")

## Audio Preprocessing (No Trimming)

In [None]:
# Audio preprocessing WITHOUT trimming logic: long clips pass through untouched.
def process_audio_clips(source_directory, output_directory,
                        padding_seconds=3.5, min_duration=1.0,
                        short_clip_threshold=10.0):
    """
    Process audio clips: eliminate too-short clips, pad short clips with silence.
    LONG CLIPS ARE KEPT AS-IS (no trimming)

    Every ``.wav`` file found recursively under ``source_directory`` is written
    to the same relative location under ``output_directory``.

    Args:
        source_directory: Root folder searched recursively for ``.wav`` files.
        output_directory: Root folder for the processed copies. It is created,
            including any missing parent folders, if it does not exist.
        padding_seconds: Seconds of silence added to BOTH ends of a short clip.
        min_duration: Clips shorter than this many seconds are eliminated.
        short_clip_threshold: Clips shorter than this many seconds are padded;
            clips at or above it are written unchanged.

    Returns:
        Tuple ``(processed_count, eliminated_count, padded_count,
        kept_as_is_count, eliminated_files)``. ``eliminated_files`` lists both
        the clips that were too short and the clips that raised an error.
        ``padded_count + kept_as_is_count == processed_count``.
    """
    from pathlib import Path

    # parents=True so a nested output path works on a fresh machine.
    Path(output_directory).mkdir(parents=True, exist_ok=True)

    # Collect the work list up front (sorted for a reproducible order).
    wav_paths = sorted(
        os.path.join(root, filename)
        for root, _dirs, files in os.walk(source_directory)
        for filename in files
        if filename.lower().endswith('.wav')
    )

    processed_count = 0
    eliminated_count = 0
    padded_count = 0
    kept_as_is_count = 0
    eliminated_files = []

    if not wav_paths:
        return processed_count, eliminated_count, padded_count, kept_as_is_count, eliminated_files

    # Only needed once there is audio to write.
    import soundfile as sf

    for filepath in wav_paths:
        try:
            # Get duration
            duration = librosa.get_duration(path=filepath)

            # Eliminate zero or near-zero length clips
            if duration < min_duration:
                eliminated_files.append(filepath)
                eliminated_count += 1
                continue

            # sr=None keeps the file's native sample rate. The librosa default
            # would resample here, and the training dataset resamples again
            # when it loads the clip.
            audio, sr = librosa.load(filepath, sr=None)

            was_padded = duration < short_clip_threshold
            if was_padded:
                # Short clips get silence on both sides, in the audio's dtype.
                silence = np.zeros(int(padding_seconds * sr), dtype=audio.dtype)
                processed_audio = np.concatenate([silence, audio, silence])
            else:
                # Long clips are kept AS-IS (no trimming)
                processed_audio = audio

            # Save processed audio, mirroring the source folder layout
            rel_path = os.path.relpath(filepath, source_directory)
            output_path = os.path.join(output_directory, rel_path)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            sf.write(output_path, processed_audio, sr)

        except Exception as e:
            # Best effort: a broken file is reported and skipped, not fatal.
            print(f"Error processing {filepath}: {e}")
            eliminated_files.append(filepath)
            eliminated_count += 1
            continue

        # Count only after the write succeeded so the counters stay consistent.
        processed_count += 1
        if was_padded:
            padded_count += 1
        else:
            kept_as_is_count += 1

        if processed_count % 100 == 0:
            print(f"Processed {processed_count} files...")

    return processed_count, eliminated_count, padded_count, kept_as_is_count, eliminated_files

print("✓ Audio processing function loaded (NO TRIMMING - long clips kept as-is)")

In [None]:
# Preprocess audio files: write padded/filtered copies of the training audio
# into a local working directory and report what happened.
PROCESSED_AUDIO_PATH = "./processed_train_audio"
print("Starting audio preprocessing...")
total, eliminated, padded, kept_as_is, eliminated_list = process_audio_clips(
    TRAIN_AUDIO_PATH, PROCESSED_AUDIO_PATH,
    padding_seconds=3.5
)
print(f"\nProcessing complete:")
print(f"  Total processed: {total}")
print(f"  Eliminated: {eliminated}")
print(f"  Padded (short clips): {padded}")
print(f"  Kept as-is (long clips): {kept_as_is}")
if eliminated_list:
    print(f"\nEliminated files:")
    for f in eliminated_list:
        print(f"  {f}")
# NOTE: this rebinds TRAIN_AUDIO_PATH. Re-running this cell without first
# re-running the configuration cell would pass the processed folder in as the
# source, i.e. the already-padded clips would be processed again in place.
TRAIN_AUDIO_PATH = PROCESSED_AUDIO_PATH  # Use processed audio going forward

## Load Dataset

In [None]:
# Build the training table from the per-region annotation CSVs
def load_dataset_from_folders():
    """Return a DataFrame with one row per annotated clip that exists on disk.

    For every name in ``regions`` the file ``<region>.csv`` under
    ``TRAIN_ANNOTATION_PATH`` is read; its ``audio`` column is resolved against
    ``TRAIN_AUDIO_PATH/<region>/``. Rows whose audio file is missing are
    dropped silently. Columns of the result: ``audio`` (full path), ``text``,
    ``region``.
    """
    def _records_for(region_name):
        annotations = pd.read_csv(
            os.path.join(TRAIN_ANNOTATION_PATH, f"{region_name}.csv")
        )
        for _, entry in annotations.iterrows():
            clip_path = os.path.join(TRAIN_AUDIO_PATH, region_name, entry['audio'])
            if not os.path.exists(clip_path):
                continue
            yield {
                'audio': clip_path,
                'text': entry['text'],
                'region': region_name,
            }

    records = [
        record
        for region_name in regions
        for record in _records_for(region_name)
    ]
    return pd.DataFrame(records)

print("Loading dataset...")
df = load_dataset_from_folders()
print(f"Total samples: {len(df)}")
print(f"\nSamples per region:")
print(df['region'].value_counts())

## Weighted Sampling Setup

In [None]:
from collections import Counter

# Calculate class weights for balanced sampling.
# NOTE: the counts come from the full `df`, i.e. before the train/val split.
region_counts = Counter(df['region'])
total_samples = len(df)

# Inverse frequency weighting: rarer regions get proportionally larger weights
class_weights = {region: total_samples / (len(region_counts) * count) 
                 for region, count in region_counts.items()}

# Normalize weights so that they sum to the number of regions (mean weight 1)
sum_weights = sum(class_weights.values())
class_weights = {k: v/sum_weights * len(class_weights) for k, v in class_weights.items()}

print("\nClass weights (for balanced sampling):")
for region, weight in sorted(class_weights.items(), key=lambda x: x[1], reverse=True):
    print(f"  {region}: {weight:.3f} (n={region_counts[region]})")

# Create sample weights for each data point (every row gets its region's weight)
df['sample_weight'] = df['region'].map(class_weights)

## Train/Val Split

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split to maintain regional distribution
train_df, val_df = train_test_split(
    df, 
    test_size=0.20,  # 20% for validation
    stratify=df['region'],
    random_state=SEED
)

print(f"\nDataset split:")
print(f"  Training: {len(train_df)} samples")
print(f"  Validation: {len(val_df)} samples")

# Per-region counts, to confirm both splits keep the regional proportions
print(f"\nTraining set distribution:")
print(train_df['region'].value_counts())
print(f"\nValidation set distribution:")
print(val_df['region'].value_counts())

## Initialize Whisper Components

In [None]:
# Initialize model components from the same checkpoint.
# The standalone feature_extractor/tokenizer are handed to the dataset class;
# the combined processor is used by the data collator and saved with the model.
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language="Bengali", task="transcribe")
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language="Bengali", task="transcribe")

print(f"✓ Loaded Whisper processor from {MODEL_NAME}")
print(f"  Language: Bengali")
print(f"  Task: Transcribe")

## Custom Dataset with On-The-Fly Augmentation

In [None]:
class BengaliAudioDataset(TorchDataset):
    """
    Custom Dataset that applies augmentation ON-THE-FLY during training.
    Each epoch will see DIFFERENT augmented versions of the audio.

    Each item is a dict with ``input_features`` (output of the feature
    extractor for one clip) and ``labels`` (token ids of the transcript).

    Unreadable clips: nothing downstream filters out bad samples, so a clip
    that fails to load is replaced by the next clip in the table that loads
    successfully (wrapping around, at most ``MAX_FALLBACK_ATTEMPTS`` tries).
    Only if none of those load is an all-zero dummy sample returned.
    NOTE: on a validation set a substituted clip is counted twice in metrics.
    """

    SAMPLE_RATE = 16000          # rate every clip is loaded at
    MAX_FALLBACK_ATTEMPTS = 10   # neighbours tried before giving up

    def __init__(self, dataframe, feature_extractor, tokenizer, apply_augmentation=False):
        """
        Args:
            dataframe: Table with at least ``audio`` (file path) and ``text``
                columns; its index is reset so positions run from 0.
            feature_extractor: Callable as ``fe(audio, sampling_rate=...)``
                returning an object with ``input_features``.
            tokenizer: Callable as ``tok(text)`` returning an object with
                ``input_ids``.
            apply_augmentation: Enable per-access augmentation (also requires
                ``CONFIG['use_augmentation']``).
        """
        self.df = dataframe.reset_index(drop=True)
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.apply_augmentation = apply_augmentation

    def __len__(self):
        return len(self.df)

    def _load_audio(self, path):
        """Load one clip; returns ``(samples, sample_rate)``."""
        return librosa.load(path, sr=self.SAMPLE_RATE)

    def _build_example(self, row):
        """Turn one table row into model inputs; raises if the clip is unusable."""
        audio, sr = self._load_audio(row['audio'])

        # Apply augmentation ON-THE-FLY if enabled.
        # This happens EVERY TIME the sample is accessed, so each epoch gets
        # different augmentations. (The bare name below is the module-level
        # function, not the boolean attribute of the same name.)
        if self.apply_augmentation and CONFIG['use_augmentation']:
            audio = apply_augmentation(audio, sr, CONFIG['augmentation_prob'])

        # Extract features
        input_features = self.feature_extractor(audio, sampling_rate=sr).input_features[0]

        # Encode labels
        labels = self.tokenizer(row['text']).input_ids

        return {
            'input_features': input_features,
            'labels': labels
        }

    def __getitem__(self, idx):
        # Outside the try on purpose: an out-of-range index must still raise
        # IndexError.
        row = self.df.iloc[idx]

        try:
            return self._build_example(row)
        except Exception as e:
            print(f"Error loading {row['audio']}: {e}")

        # Substitute a real neighbouring clip instead of training on silence.
        total = len(self.df)
        start = idx % total  # normalises negative positions
        for step in range(1, min(total, self.MAX_FALLBACK_ATTEMPTS + 1)):
            candidate = self.df.iloc[(start + step) % total]
            try:
                example = self._build_example(candidate)
            except Exception as e:
                print(f"Error loading {candidate['audio']}: {e}")
                continue
            print(f"  Substituting {candidate['audio']} for unreadable sample {idx}")
            return example

        # Last resort so a run is not aborted by a handful of bad files.
        # NOTE(review): the (80, 3000) shape assumes an 80-bin, 30-second
        # log-mel input -- confirm it matches the feature extractor in use.
        print(f"  No loadable substitute found for sample {idx}; returning a dummy sample")
        return {
            'input_features': np.zeros((80, 3000), dtype=np.float32),  # Dummy features
            'labels': [0]  # Dummy label
        }

print("✓ Custom Dataset class defined")
print("  Augmentation will be applied ON-THE-FLY during training")
print("  Each epoch will see FRESH random augmentations")

In [None]:
# Create datasets with on-the-fly augmentation.
# Both wrap the split DataFrames lazily: audio is read when an item is
# requested, not here.
print("\n" + "="*60)
print("CREATING DATASETS")
print("="*60)

# Training set WITH on-the-fly augmentation
train_dataset = BengaliAudioDataset(
    train_df, 
    feature_extractor, 
    tokenizer, 
    apply_augmentation=True  # ← Augmentation applied on-the-fly
)

# Validation set WITHOUT augmentation
val_dataset = BengaliAudioDataset(
    val_df, 
    feature_extractor, 
    tokenizer, 
    apply_augmentation=False  # ← No augmentation for validation
)

print(f"✓ Training dataset: {len(train_dataset)} samples (with on-the-fly augmentation)")
print(f"✓ Validation dataset: {len(val_dataset)} samples (no augmentation)")
print("\n⚡ KEY FEATURE: Augmentation happens DURING training, not preprocessing")
print("   This means each epoch sees DIFFERENT augmented versions!")
print("="*60)

## Data Collator

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Batch speech examples: pad audio features and label ids separately.

    ``processor`` must expose ``feature_extractor.pad`` and ``tokenizer.pad``
    (plus ``tokenizer.bos_token_id``). Padded label positions are set to -100
    so the loss ignores them.
    """
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        feature_extractor = self.processor.feature_extractor
        tokenizer = self.processor.tokenizer

        # Audio and text need different padding routines, so split them first.
        audio_items = [{"input_features": item["input_features"]} for item in features]
        label_items = [{"input_ids": item["labels"]} for item in features]

        batch = feature_extractor.pad(audio_items, return_tensors="pt")
        padded_labels = tokenizer.pad(label_items, return_tensors="pt")

        # Positions outside the attention mask are padding: mark them -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the leading column only when EVERY sequence starts with BOS.
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
print("✓ Data collator initialized")

## Evaluation Metrics

In [None]:
!pip install jiwer

In [None]:
import evaluate
from jiwer import wer, cer

# Load metrics
metric_wer = evaluate.load("wer")

def compute_metrics(pred):
    """Score generated transcripts against references.

    ``pred`` must provide ``predictions`` and ``label_ids`` (token id arrays).
    Returns a dict with ``wer`` and ``cer`` as percentages and
    ``char_accuracy`` defined as ``100 - cer``.

    Note: ``pred.label_ids`` is modified in place (-100 becomes the pad id).
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 marks ignored positions; swap in the pad id so decoding works.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    hypotheses = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    # Word error rate via the `evaluate` metric, as a percentage
    wer_score = 100 * metric_wer.compute(predictions=hypotheses, references=references)

    # Character error rate via jiwer (reference first, hypothesis second)
    cer_score = 100 * cer(references, hypotheses)

    return dict(
        wer=wer_score,
        cer=cer_score,
        char_accuracy=100 - cer_score,
    )

print("✓ Evaluation metrics defined (WER, CER, Character Accuracy)")

## Custom Trainer with Weighted Sampling

In [None]:
from torch.utils.data import WeightedRandomSampler

class WeightedSeq2SeqTrainer(Seq2SeqTrainer):
    """
    Custom trainer that uses WeightedRandomSampler for balanced regional sampling.
    The sampler draws a fresh weighted sample (with replacement) each time it
    is iterated, i.e. every epoch.

    Args:
        sample_weights: One weight per training example, in dataset order, or
            ``None`` to fall back to the parent's sampler.
        *args, **kwargs: Forwarded unchanged to ``Seq2SeqTrainer``.
    """
    def __init__(self, sample_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sample_weights = sample_weights

    def _get_train_sampler(self, dataset=None):
        """Return the weighted sampler, or defer to the parent implementation.

        ``dataset`` is optional because the hook's signature differs between
        transformers releases: transformers 4.47.0, the version this notebook
        installs, calls it with no arguments, while newer releases pass the
        training dataset. A required parameter raises TypeError on the former.
        """
        if self.sample_weights is not None and CONFIG['use_weighted_sampling']:
            print("  ✓ Using WeightedRandomSampler (resamples every epoch)")
            return WeightedRandomSampler(
                weights=self.sample_weights,
                num_samples=len(self.sample_weights),
                replacement=True
            )
        # Forward the dataset only when the caller supplied one, so the parent
        # is invoked the same way this method was.
        if dataset is None:
            return super()._get_train_sampler()
        return super()._get_train_sampler(dataset)

# Prepare sample weights for trainer.
# Taken in train_df row order; BengaliAudioDataset keeps that order (it only
# resets the index), so weight i belongs to dataset item i.
train_sample_weights = torch.DoubleTensor(train_df['sample_weight'].values)

print("✓ Custom trainer with weighted sampling initialized")
print(f"  Using weighted sampling: {CONFIG['use_weighted_sampling']}")
print("  ⚡ WeightedRandomSampler automatically resamples each epoch")

## Load Model

In [None]:
# Load model (pretrained weights from the same checkpoint as the processor)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

# Configure model for Bengali: clear the forced decoder ids and the
# suppressed-token list stored in the model config, and turn the decoder
# cache off for training.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False

# Count trainable parameters (nothing is frozen above, so this is expected to
# equal the total)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in model.parameters())

print("\n" + "="*60)
print("MODEL CONFIGURATION (FULL FINE-TUNING)")
print("="*60)
print(f"Total parameters: {all_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Trainable %: {100 * trainable_params / all_params:.2f}%")
print("\nAll model parameters will be updated during training")
print("="*60)

## Training Arguments

In [None]:
# Training arguments. Evaluation and checkpointing are both switched off
# during training; validation runs once afterwards via trainer.evaluate() and
# the model is saved once in the "Save Model" cell.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation'],
    learning_rate=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
    warmup_steps=CONFIG['warmup_steps'],
    num_train_epochs=CONFIG['num_epochs'],
    
    # SAVE ONLY AT THE END
    # (no periodic evaluation, no intermediate checkpoints, no best-model reload)
    eval_strategy="no",
    #eval_steps=CONFIG['eval_steps'],
    save_strategy="no",
    save_total_limit=None,
    load_best_model_at_end=False,
    
    # Optimization
    fp16=CONFIG['fp16'],
    gradient_checkpointing=False,
    optim="adamw_torch",
    
    # Generation (with eval_strategy="no" these only affect explicit
    # evaluate()/predict() calls)
    predict_with_generate=True,
    generation_max_length=225,
    generation_num_beams=5,
    
    # Logging (report_to=[] disables external experiment trackers)
    logging_steps=50,
    logging_dir=f"{OUTPUT_DIR}/logs",
    report_to=[],
    
    # Other
    # The custom dataset already yields exactly the keys the collator needs,
    # so column pruning is disabled.
    remove_unused_columns=False,
    label_names=["labels"],
    push_to_hub=False,
    seed=SEED,
)

print("\n" + "="*60)
print("TRAINING CONFIGURATION")
print("="*60)
print(f"Batch size per device: {CONFIG['batch_size']}")
print(f"Gradient accumulation steps: {CONFIG['gradient_accumulation']}")
print(f"Effective batch size: {CONFIG['batch_size'] * CONFIG['gradient_accumulation']}")
print(f"Learning rate: {CONFIG['learning_rate']}")
print(f"Number of epochs: {CONFIG['num_epochs']}")
print(f"Mixed precision (FP16): {CONFIG['fp16']}")
print(f"\n⚡ PROPERLY INTEGRATED FEATURES:")
print(f"  ✓ On-the-fly augmentation per epoch: {CONFIG['use_augmentation']}")
print(f"  ✓ Weighted sampling per epoch: {CONFIG['use_weighted_sampling']}")
print("="*60)

## Initialize Trainer

In [None]:
# Initialize trainer with BOTH features properly integrated
trainer = WeightedSeq2SeqTrainer(
    sample_weights=train_sample_weights,  # ← WEIGHTED SAMPLING (per epoch)
    args=training_args,
    model=model,
    train_dataset=train_dataset,  # ← Contains on-the-fly augmentation (per epoch)
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): recent transformers releases name this argument
    # `processing_class`; `tokenizer` is kept for the pinned version.
    tokenizer=processor.feature_extractor,
    callbacks=[],
)

print("✓ Trainer initialized with PROPER per-epoch features")
print("\n⚡ VERIFICATION:")
print("  ✓ Augmentation: Applied in Dataset.__getitem__() → fresh each epoch")
print("  ✓ Weighted Sampling: WeightedRandomSampler → resamples each epoch")
# Report early stopping only if it is actually wired in: it needs an
# EarlyStoppingCallback on the trainer, and none is registered above.
if any(isinstance(cb, EarlyStoppingCallback) for cb in trainer.callback_handler.callbacks):
    print(f"  ✓ Early stopping patience: {CONFIG['early_stopping_patience']} evaluations")
else:
    print("  ⚠ Early stopping: not active (no EarlyStoppingCallback registered)")

## Training

In [None]:
print("="*60)
print("STARTING TRAINING")
print("="*60)
print(f"\nDataset:")
print(f"  Training samples: {len(train_dataset)}")
print(f"  Validation samples: {len(val_dataset)}")
print(f"\n⚡ PROPERLY INTEGRATED FEATURES (verified):")
print(f"  ✓ Audio preprocessing: NO TRIMMING (long clips kept as-is)")
print(f"  ✓ Augmentation: ON-THE-FLY per epoch (fresh augmentations each time)")
print(f"  ✓ Weighted sampling: PER EPOCH (balanced regional representation)")
print(f"\nTraining will start now...\n")

# Train
trainer.train()

In [None]:
print("\n" + "="*60)
print("TRAINING COMPLETED")
print("="*60)

## Evaluation

In [None]:
# Single post-training evaluation on the trainer's eval_dataset; the metrics
# come back as a dict produced by compute_metrics plus the trainer's own entries.
print("\nEvaluating on validation set...")
eval_results = trainer.evaluate()

print("\n" + "="*60)
print("VALIDATION RESULTS")
print("="*60)
for key, value in eval_results.items():
    # Floats get fixed precision; anything else is printed as-is.
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")
print("="*60)

## Save Model

In [None]:
# Save final model. The processor is written to the same folder so the
# directory can be loaded for inference on its own.
final_model_dir = f"{OUTPUT_DIR}/final-model"
trainer.save_model(final_model_dir)
processor.save_pretrained(final_model_dir)

print("="*60)
print("MODEL SAVED")
print("="*60)
print(f"Model saved to: {final_model_dir}")
print("✓ Model weights")
print("✓ Processor configuration")
print("="*60)

## Training Statistics

In [None]:
# Get training history from logs
import json

log_history = trainer.state.log_history

# Extract key metrics. Each list may be empty: training losses are only logged
# every `logging_steps`, and eval entries exist only if trainer.evaluate() was
# run (no evaluation happens during training in this notebook).
train_losses = [x['loss'] for x in log_history if 'loss' in x]
eval_wers = [x['eval_wer'] for x in log_history if 'eval_wer' in x]
eval_char_accs = [x['eval_char_accuracy'] for x in log_history if 'eval_char_accuracy' in x]

print("\n" + "="*60)
print("TRAINING STATISTICS")
print("="*60)
print(f"\nTotal training steps: {trainer.state.global_step}")
print(f"Epochs completed: {trainer.state.epoch}")

# Guard every aggregate so a missing metric is reported instead of raising
# IndexError/ValueError on an empty list.
if train_losses:
    print(f"\nFinal training loss: {train_losses[-1]:.4f}")
else:
    print("\nFinal training loss: n/a (no training loss was logged)")

if eval_wers:
    print(f"Best validation WER: {min(eval_wers):.2f}%")
else:
    print("Best validation WER: n/a (no evaluation was logged)")

if eval_char_accs:
    print(f"Best character accuracy: {max(eval_char_accs):.2f}%")
else:
    print("Best character accuracy: n/a (no evaluation was logged)")

print(f"\nImprovement over training:")
# An improvement needs at least two evaluations to compare.
if len(eval_wers) > 1:
    print(f"  WER improved by: {eval_wers[0] - min(eval_wers):.2f}%")
    if eval_char_accs:
        print(f"  Char accuracy improved by: {max(eval_char_accs) - eval_char_accs[0]:.2f}%")
print("="*60)

## Sample Predictions

In [None]:
# Test on a few validation samples: generate a transcript for a handful of
# fixed positions and print it next to the reference text.
print("\nTesting on sample predictions...")
print("="*60)

# Get a few samples (fixed positions in the validation set)
sample_indices = [0, 10, 20, 30, 40]

# The [:5] slice is a no-op for the five indices above.
for idx in sample_indices[:5]:
    # Indices are ascending, so the first out-of-range one ends the loop.
    if idx >= len(val_dataset):
        break
    
    sample = val_dataset[idx]
    
    # Get prediction: add a batch dimension and move to the model's device
    inputs = {
        'input_features': torch.tensor(sample['input_features']).unsqueeze(0).to(model.device)
    }
    
    # NOTE(review): this uses whatever train/eval mode `model` is currently in;
    # consider calling model.eval() first if this cell may run on its own.
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=225, num_beams=5)
    
    prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    reference = tokenizer.decode(sample['labels'], skip_special_tokens=True)
    
    # The printed number is the dataset position plus one.
    print(f"\nSample {idx + 1}:")
    print(f"  Reference:  {reference}")
    print(f"  Prediction: {prediction}")

print("\n" + "="*60)

## Load for Inference

In [None]:
# Load the final trained model back from disk (the folder written by the
# "Save Model" cell) as separate inference_* objects.
print("Loading trained model for inference...")

inference_model = WhisperForConditionalGeneration.from_pretrained(final_model_dir)
inference_processor = WhisperProcessor.from_pretrained(final_model_dir)

# Move to GPU when one is available, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inference_model.to(device)
inference_model.eval()  # inference mode: disables dropout

print(f"✓ Model loaded on {device}")
print(f"✓ Ready for inference")