# Ablation V0: CTC-Only Baseline (No LoRA)

This notebook is part of an ablation study to compare different fine-tuning approaches.

## Model and Approach
- **Base Model**: `agkphysics/wav2vec2-large-xlsr-53-amharic`
- **Fine-tuning Method**: CTC head only (no LoRA)
- **Task**: Automatic Speech Recognition (ASR)
- **Domain**: Legal Amharic text
- **Dataset**: Dataset_4.0h


## 1. Installation and Setup


In [None]:
# Force single GPU usage to avoid DataParallel issues on Kaggle
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'



In [1]:
%pip install -q transformers datasets accelerate peft torchaudio librosa jiwer soundfile matplotlib



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Union, Optional
import librosa
import soundfile as sf
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)

from datasets import Dataset, DatasetDict
import jiwer

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


PyTorch version: 2.9.1
CUDA available: False


In [3]:
# Set random seed for reproducibility
SEED = 42  # Change to 123, 456 for different seeds

import random
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"Random seed set to: {SEED}")


Random seed set to: 42


In [None]:
# Kaggle: Dataset is uploaded as input dataset
# No need to mount Google Drive
print("Running on Kaggle - using input dataset")

## 2. Configuration


In [None]:
MODEL_NAME = "agkphysics/wav2vec2-large-xlsr-53-amharic"

# Dataset paths - Kaggle Input
# Assume Dataset_4.0h is uploaded to Kaggle as input dataset
# Update the path below to match your actual Kaggle input dataset name
# Kaggle input path format: /kaggle/input/your-dataset-name
# If your dataset is named "dataset-40h" in Kaggle, use: "/kaggle/input/dataset-40h"
# If your dataset is named "Dataset_4.0h", use: "/kaggle/input/Dataset_4.0h" (or whatever Kaggle shows)
BASE_DATASET_DIR = "/kaggle/input/dataset-4-0h/Dataset_4.0h"  # UPDATE THIS to match your Kaggle dataset name
# The dataset should contain: audio/, train.csv, val.csv, test.csv
AUDIO_DIR = f"{BASE_DATASET_DIR}/audio"
TRAIN_CSV = f"{BASE_DATASET_DIR}/train.csv"
VAL_CSV = f"{BASE_DATASET_DIR}/val.csv"
TEST_CSV = f"{BASE_DATASET_DIR}/test.csv"

# Verify paths exist
print("Dataset paths (Kaggle):")
print(f"  Audio directory: {AUDIO_DIR}")
print(f"  Train CSV: {TRAIN_CSV}")
print(f"  Val CSV: {VAL_CSV}")
print(f"  Test CSV: {TEST_CSV}")

# Check if paths exist
if not os.path.exists(AUDIO_DIR):
    print(f"\nWARNING: Audio directory not found: {AUDIO_DIR}")
    print("  Please ensure dataset is uploaded to Kaggle as input dataset")
if not os.path.exists(TRAIN_CSV):
    print(f"WARNING: Train CSV not found: {TRAIN_CSV}")
if not os.path.exists(VAL_CSV):
    print(f"WARNING: Val CSV not found: {VAL_CSV}")
if not os.path.exists(TEST_CSV):
    print(f"WARNING: Test CSV not found: {TEST_CSV}")

# Output directory - Kaggle working directory (persistent)
OUTPUT_DIR = "/kaggle/working/wav2vec2_ctc_only_baseline"

# Training configuration for Kaggle GPU
# Parameters match other ablation study notebooks
# Checkpoints will be saved every 500 steps to /kaggle/working/ (persistent)
TRAINING_ARGS = {
    "output_dir": OUTPUT_DIR,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-4,
    "warmup_steps": 500,
    "max_steps": 2000,
    "gradient_checkpointing": True,
    "fp16": True,
    "eval_strategy": "steps",
    "eval_steps": 500,
    "save_strategy": "steps",
    "save_steps": 500,
    "save_total_limit": 3,
    "load_best_model_at_end": True,
    "metric_for_best_model": "wer",
    "greater_is_better": False,
    "logging_steps": 100,
    "report_to": "none",
    "push_to_hub": False
}

print("\nConfiguration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  Max steps: {TRAINING_ARGS['max_steps']}")
print(f"  Batch size: {TRAINING_ARGS['per_device_train_batch_size']}")
print(f"  Gradient accumulation: {TRAINING_ARGS['gradient_accumulation_steps']}")
print(f"  Effective batch size: {TRAINING_ARGS['per_device_train_batch_size'] * TRAINING_ARGS['gradient_accumulation_steps']}")
print(f"  Learning rate: {TRAINING_ARGS['learning_rate']}")
print(f"  FP16: {TRAINING_ARGS['fp16']}")
print(f"  Gradient checkpointing: {TRAINING_ARGS['gradient_checkpointing']}")


üìç Running locally - using path: /Users/blank/Documents/Audio/Dataset_4.0h

Dataset paths:
  Audio directory: /Users/blank/Documents/Audio/Dataset_4.0h/audio
  Train CSV: /Users/blank/Documents/Audio/Dataset_4.0h/train.csv
  Val CSV: /Users/blank/Documents/Audio/Dataset_4.0h/val.csv
  Test CSV: /Users/blank/Documents/Audio/Dataset_4.0h/test.csv
‚ö†Ô∏è  TEST MODE ENABLED - Running quick validation (< 1 minute)
  üì± Detected Apple Silicon (MPS)
  ‚ö†Ô∏è  Using CPU for test mode to avoid MPS memory issues
  üí° Note: Full training on Kaggle will use CUDA with more memory
Configuration:
  Model: agkphysics/wav2vec2-large-xlsr-53-amharic
  Output directory: wav2vec2_ctc_only_baseline
  LoRA rank (r): 8
  LoRA alpha: 32

  ‚ö†Ô∏è  TEST MODE: Quick validation run (< 1 minute)
  ‚ö†Ô∏è  Set TEST_MODE = False for full training!
  Max steps: 2
  Batch size: 2


## 3. Load and Prepare Data


In [6]:
def load_csv_split(csv_path, audio_dir):
    """Load a CSV split and return list of (audio_path, transcription) tuples"""
    df = pd.read_csv(csv_path)

    data = []
    for _, row in df.iterrows():
        audio_path = Path(audio_dir) / row['file_name']
        transcription = str(row['transcription']).strip()

        if audio_path.exists():
            data.append({
                'audio_path': str(audio_path),
                'transcription': transcription
            })
        else:
            print(f"Warning: Audio file not found: {audio_path}")

    return data

train_data = load_csv_split(TRAIN_CSV, AUDIO_DIR)
val_data = load_csv_split(VAL_CSV, AUDIO_DIR)
test_data = load_csv_split(TEST_CSV, AUDIO_DIR)

print(f"Train samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")
print(f"\nTotal samples: {len(train_data) + len(val_data) + len(test_data)}")


Train samples: 824
Validation samples: 103
Test samples: 103

Total samples: 1030


## 4. Create Vocabulary and Processor


In [7]:
# Use the original model's processor instead of creating a new vocabulary
# This preserves the pre-trained CTC head weights
print("Loading original model processor to preserve vocabulary and CTC head...")
original_processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
print(f"Original vocabulary size: {len(original_processor.tokenizer)}")
print(f"First 20 characters: {list(original_processor.tokenizer.get_vocab().keys())[:20]}")


Loading original model processor to preserve vocabulary and CTC head...


Original vocabulary size: 234
First 20 characters: [' ', '</s>', '<pad>', '<s>', '<unk>', '·àÄ', '·àÅ', '·àÇ', '·àÉ', '·àÑ', '·àÖ', '·àÜ', '·àà', '·àâ', '·àä', '·àã', '·àå', '·àç', '·àé', '·àè']


In [8]:
# Use the original processor to maintain vocabulary compatibility
processor = original_processor
print("Using original model processor to preserve CTC head weights")


Using original model processor to preserve CTC head weights


In [None]:
# Device selection for Kaggle
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    device = torch.device('cpu')
    print("Using CPU (CUDA not available)")


‚ö†Ô∏è  TEST MODE: Using CPU to avoid MPS memory issues
Selected device: cpu


## 5. Load Model and Apply LoRA


In [10]:
# Load model WITHOUT reinitializing CTC head - use original vocabulary
model = Wav2Vec2ForCTC.from_pretrained(
    MODEL_NAME,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id
    # No vocab_size or ignore_mismatched_sizes - preserves original CTC head
)

print(f"Base model loaded: {MODEL_NAME}")
print(f"Vocabulary size: {len(processor.tokenizer)}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

# Ensure CTC head (lm_head) is trainable
for param in model.lm_head.parameters():
    param.requires_grad = True
print(f"CTC head (lm_head) parameters: {sum(p.numel() for p in model.lm_head.parameters()) / 1e6:.2f}M")
print(f"CTC head trainable: {all(p.requires_grad for p in model.lm_head.parameters())}")

if torch.cuda.is_available():
    model = model.to("cuda")
    print("Model moved to CUDA")


Base model loaded: agkphysics/wav2vec2-large-xlsr-53-amharic
Vocabulary size: 234
Model parameters: 315.68M
CTC head (lm_head) parameters: 0.24M
CTC head trainable: True


In [11]:
# CTC-ONLY BASELINE: No LoRA adapters, only CTC head is trainable
# This baseline tests if fine-tuning just the CTC head is sufficient

print("CTC-only baseline: Training only the CTC head (no LoRA)")
print(f"CTC head (lm_head) parameters: {sum(p.numel() for p in model.lm_head.parameters()) / 1e6:.2f}M")
print(f"CTC head trainable: {all(p.requires_grad for p in model.lm_head.parameters())}")

# Skip LoRA application - model is ready for training
print("\nModel ready for training (CTC head only)")


CTC-only baseline: Training only the CTC head (no LoRA)
CTC head (lm_head) parameters: 0.24M
CTC head trainable: True

Model ready for training (CTC head only)


## 6. Prepare Dataset


In [12]:
def speech_file_to_array_fn(path):
    """Load audio file and resample to 16kHz"""
    speech_array, sampling_rate = librosa.load(path, sr=16000)
    return speech_array

def prepare_dataset(batch):
    """Process a batch of audio and transcriptions"""
    audio = [speech_file_to_array_fn(path) for path in batch["audio_path"]]

    # Process audio - return as lists (no return_tensors)
    audio_features = processor.feature_extractor(
        audio,
        sampling_rate=16000
        # No padding, no return_tensors here - collator handles it
    )
    # Only store input_values, not attention_mask
    batch["input_values"] = audio_features.input_values  # This will be a list of arrays

    # Process text using tokenizer directly - return as lists
    batch["labels"] = [processor.tokenizer(transcription, add_special_tokens=False)["input_ids"] for transcription in batch["transcription"]]

    # Make sure we're not accidentally storing attention_mask
    if "attention_mask" in batch:
        del batch["attention_mask"]

    return batch

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=train_dataset.column_names,
    batch_size=100,
    batched=True
)

val_dataset = val_dataset.map(
    prepare_dataset,
    remove_columns=val_dataset.column_names,
    batch_size=100,
    batched=True
)

test_dataset = test_dataset.map(
    prepare_dataset,
    remove_columns=test_dataset.column_names,
    batch_size=100,
    batched=True
)

print("Datasets prepared:")
print(f"  Train: {len(train_dataset)} samples")
print(f"  Validation: {len(val_dataset)} samples")
print(f"  Test: {len(test_dataset)} samples")


Map:   0%|          | 0/824 [00:00<?, ? examples/s]

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 824/824 [00:03<00:00, 225.06 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 275.49 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 306.78 examples/s]

Datasets prepared:
  Train: 824 samples
  Validation: 103 samples
  Test: 103 samples





In [None]:
# Move model to device
if torch.cuda.is_available():
    model = model.to("cuda")
    print("Model moved to CUDA")
else:
    model = model.to("cpu")
    print("Model moved to CPU")


‚ö†Ô∏è  Moving model to CPU for test mode (MPS memory constraints)
‚úÖ Model moved to CPU


## 7. Data Collator


In [None]:
# Pre-training summary
print("\n" + "="*70)
print("Training Configuration")
print("="*70)
print(f"Max steps: {TRAINING_ARGS['max_steps']}")
print(f"Batch size: {TRAINING_ARGS['per_device_train_batch_size']}")
print(f"Gradient accumulation: {TRAINING_ARGS['gradient_accumulation_steps']}")
print(f"Effective batch size: {TRAINING_ARGS['per_device_train_batch_size'] * TRAINING_ARGS['gradient_accumulation_steps']}")
print(f"Learning rate: {TRAINING_ARGS['learning_rate']}")
print(f"FP16: {TRAINING_ARGS['fp16']}")
print(f"Gradient checkpointing: {TRAINING_ARGS['gradient_checkpointing']}")
print("="*70 + "\n")



‚ö†Ô∏è  TEST MODE ENABLED - Quick Validation Run
This will run only 2 training steps to verify setup.
Expected runtime: < 1 minute

To run FULL training:
  1. Go to Configuration cell (Cell 7)
  2. Change: TEST_MODE = False
  3. Re-run from Configuration cell onwards



In [15]:
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        import torch

        # Extract input_values and labels - explicitly ignore attention_mask if present
        input_values_list = [feature["input_values"] for feature in features]
        label_features = [feature["labels"] for feature in features]

        # Convert audio to tensors and pad manually
        input_values_tensors = []
        for iv in input_values_list:
            if isinstance(iv, torch.Tensor):
                input_values_tensors.append(iv)
            elif isinstance(iv, np.ndarray):
                input_values_tensors.append(torch.tensor(iv, dtype=torch.float32))
            else:
                input_values_tensors.append(torch.tensor(np.array(iv), dtype=torch.float32))

        # Pad audio sequences
        input_values = torch.nn.utils.rnn.pad_sequence(
            input_values_tensors,
            batch_first=True,
            padding_value=0.0
        )

        # Pad labels manually
        max_label_len = max(len(labels) for labels in label_features)
        pad_token_id = self.processor.tokenizer.pad_token_id

        padded_labels = []
        for labels in label_features:
            if isinstance(labels, torch.Tensor):
                labels = labels.tolist()

            padding_length = max_label_len - len(labels)
            padded = labels + [pad_token_id] * padding_length
            padded_labels.append(padded)

        labels_tensor = torch.tensor(padded_labels, dtype=torch.long)

        # Mask padded labels with -100
        attention_mask_labels = (labels_tensor != pad_token_id).long()
        labels_tensor = labels_tensor.masked_fill(attention_mask_labels.ne(1), -100)

        # Return ONLY input_values and labels - explicitly create a new dict
        batch = {}
        batch["input_values"] = input_values
        batch["labels"] = labels_tensor

        # Explicitly ensure no other keys are present
        return batch
# Add these lines:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
print("Data collator created")

Data collator created


In [None]:
# Post-training summary
print("\n" + "="*70)
print("Training completed")
print("="*70)
if 'train_result' in locals():
    print(f"Training loss: {train_result.training_loss:.4f}")
if 'final_model_path' in locals():
    print(f"Model saved to: {final_model_path}")
print("="*70)


‚úÖ TEST MODE VALIDATION COMPLETED SUCCESSFULLY!
All components verified:
  ‚úì Model loading
  ‚úì Data processing
  ‚úì Training loop
  ‚úì Evaluation metrics

‚ö†Ô∏è  IMPORTANT: This was a QUICK TEST (2 steps only)
   For full training on Kaggle:
   1. Set TEST_MODE = False in Configuration cell
   2. Re-run the notebook


## 8. Evaluation Metrics


In [17]:
def compute_metrics(pred):
    """Compute WER (Word Error Rate) and CER (Character Error Rate)"""
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = jiwer.wer(label_str, pred_str)
    cer = jiwer.cer(label_str, pred_str)

    return {"wer": wer, "cer": cer}

print("Evaluation metrics function created")


Evaluation metrics function created


## 9. Training Arguments and Trainer


In [18]:
training_args = TrainingArguments(**TRAINING_ARGS)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Trainer initialized")
print(f"\nTraining configuration:")
print(f"  Max steps: {training_args.max_steps}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Evaluation steps: {training_args.eval_steps}")


Trainer initialized

Training configuration:
  Max steps: 2
  Batch size: 2
  Gradient accumulation: 1
  Effective batch size: 2
  Learning rate: 0.0001
  Evaluation steps: 2


## 10. Train Model


In [None]:
# Create output directory for Kaggle (persistent)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory created: {OUTPUT_DIR}")
print(f"Checkpoints will be saved every {TRAINING_ARGS['save_steps']} steps")
print(f"All outputs will be saved to Kaggle working directory (persistent)")


In [None]:
# Ensure model is not wrapped in DataParallel and is on single device
if isinstance(model, torch.nn.DataParallel):
    print("Unwrapping model from DataParallel...")
    model = model.module

# Move to single device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(f"Model on device: {device}")
print(f"Visible GPUs: {torch.cuda.device_count()}")


In [19]:
train_result = trainer.train()

# Save final model
final_model_path = f"{OUTPUT_DIR}_final"
trainer.save_model(final_model_path)
processor.save_pretrained(final_model_path)

print("\nTraining completed!")
print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Final model saved to: {final_model_path}")

NotImplementedError: The operator 'aten::_ctc_loss' is not currently implemented for the MPS device. If you want this op to be considered for addition please comment on https://github.com/pytorch/pytorch/issues/141287 and mention use-case, that resulted in missing op as well as commit hash 5811a8d7da873dd699ff6687092c225caffcf1bb. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

## Training Visualizations

Visualize training progress, validation metrics, and model performance.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Extract training history from trainer state
train_history = trainer.state.log_history

# Separate training and evaluation logs
train_logs = [log for log in train_history if 'loss' in log and 'eval_loss' not in log]
eval_logs = [log for log in train_history if 'eval_loss' in log]

# Extract data
train_steps = [log['step'] for log in train_logs]
train_losses = [log['loss'] for log in train_logs]

eval_steps = [log['step'] for log in eval_logs]
eval_losses = [log['eval_loss'] for log in eval_logs]
eval_wers = [log.get('eval_wer', 0) for log in eval_logs]
eval_cers = [log.get('eval_cer', 0) for log in eval_logs]

# Create comprehensive figure
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

# Plot 1: Training Loss (top left, spans 2 columns)
ax1 = fig.add_subplot(gs[0, :])
ax1.plot(train_steps, train_losses, 'b-', linewidth=2.5, label='Training Loss', alpha=0.8)
ax1.set_xlabel('Training Steps', fontsize=13, fontweight='bold')
ax1.set_ylabel('Loss', fontsize=13, fontweight='bold')
ax1.set_title('Training Loss Over Time', fontsize=15, fontweight='bold', pad=15)
ax1.grid(True, alpha=0.3, linestyle='--')
ax1.legend(fontsize=12, loc='best')
ax1.set_facecolor('#f8f9fa')

# Plot 2: Validation Loss
ax2 = fig.add_subplot(gs[1, 0])
if eval_losses:
    ax2.plot(eval_steps, eval_losses, 'r-', linewidth=2.5, marker='o', markersize=5, label='Validation Loss', alpha=0.8)
    ax2.set_xlabel('Training Steps', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Loss', fontsize=12, fontweight='bold')
    ax2.set_title('Validation Loss', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3, linestyle='--')
    ax2.legend(fontsize=11)
    ax2.set_facecolor('#f8f9fa')
else:
    ax2.text(0.5, 0.5, 'No validation data', ha='center', va='center', fontsize=12)
    ax2.set_title('Validation Loss', fontsize=14, fontweight='bold')

# Plot 3: Word Error Rate (WER)
ax3 = fig.add_subplot(gs[1, 1])
if eval_wers and any(w > 0 for w in eval_wers):
    ax3.plot(eval_steps, eval_wers, 'g-', linewidth=2.5, marker='s', markersize=5, label='WER', alpha=0.8)
    ax3.set_xlabel('Training Steps', fontsize=12, fontweight='bold')
    ax3.set_ylabel('Word Error Rate', fontsize=12, fontweight='bold')
    ax3.set_title('Word Error Rate (WER)', fontsize=14, fontweight='bold')
    ax3.grid(True, alpha=0.3, linestyle='--')
    ax3.legend(fontsize=11)
    ax3.set_ylim(bottom=0)
    ax3.set_facecolor('#f8f9fa')
else:
    ax3.text(0.5, 0.5, 'No WER data', ha='center', va='center', fontsize=12)
    ax3.set_title('Word Error Rate (WER)', fontsize=14, fontweight='bold')

# Plot 4: Character Error Rate (CER)
ax4 = fig.add_subplot(gs[2, 0])
if eval_cers and any(c > 0 for c in eval_cers):
    ax4.plot(eval_steps, eval_cers, 'm-', linewidth=2.5, marker='^', markersize=5, label='CER', alpha=0.8)
    ax4.set_xlabel('Training Steps', fontsize=12, fontweight='bold')
    ax4.set_ylabel('Character Error Rate', fontsize=12, fontweight='bold')
    ax4.set_title('Character Error Rate (CER)', fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3, linestyle='--')
    ax4.legend(fontsize=11)
    ax4.set_ylim(bottom=0)
    ax4.set_facecolor('#f8f9fa')
else:
    ax4.text(0.5, 0.5, 'No CER data', ha='center', va='center', fontsize=12)
    ax4.set_title('Character Error Rate (CER)', fontsize=14, fontweight='bold')

# Plot 5: Combined Loss Plot (Training vs Validation)
ax5 = fig.add_subplot(gs[2, 1])
if eval_losses:
    ax5.plot(train_steps, train_losses, 'b-', linewidth=2, label='Training Loss', alpha=0.7)
    ax5.plot(eval_steps, eval_losses, 'r-', linewidth=2, marker='o', markersize=4, label='Validation Loss', alpha=0.7)
    ax5.set_xlabel('Training Steps', fontsize=12, fontweight='bold')
    ax5.set_ylabel('Loss', fontsize=12, fontweight='bold')
    ax5.set_title('Training vs Validation Loss', fontsize=14, fontweight='bold')
    ax5.grid(True, alpha=0.3, linestyle='--')
    ax5.legend(fontsize=11)
    ax5.set_facecolor('#f8f9fa')
else:
    ax5.plot(train_steps, train_losses, 'b-', linewidth=2, label='Training Loss', alpha=0.7)
    ax5.set_xlabel('Training Steps', fontsize=12, fontweight='bold')
    ax5.set_ylabel('Loss', fontsize=12, fontweight='bold')
    ax5.set_title('Training Loss', fontsize=14, fontweight='bold')
    ax5.grid(True, alpha=0.3, linestyle='--')
    ax5.legend(fontsize=11)
    ax5.set_facecolor('#f8f9fa')

# Add overall title
fig.suptitle(f'Training Progress: {OUTPUT_DIR}', fontsize=18, fontweight='bold', y=0.995)

plt.tight_layout(rect=[0, 0, 1, 0.99])
plt.show()

# Save high-resolution plot
plot_path = f"{OUTPUT_DIR}_training_plots.png"
fig.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')
print(f"\nTraining plots saved to: {plot_path}")

# Print detailed summary statistics
print("\n" + "="*70)
print(f"TRAINING SUMMARY: {OUTPUT_DIR}")
print("="*70)
if train_losses:
    print(f"\nTraining Loss:")
    print(f"  Initial: {train_losses[0]:.4f}")
    print(f"  Final: {train_losses[-1]:.4f}")
    print(f"  Best: {min(train_losses):.4f} (at step {train_steps[train_losses.index(min(train_losses))]})")
    print(f"  Improvement: {((train_losses[0] - min(train_losses)) / train_losses[0] * 100):.2f}%")
if eval_losses:
    print(f"\nValidation Loss:")
    print(f"  Initial: {eval_losses[0]:.4f}")
    print(f"  Final: {eval_losses[-1]:.4f}")
    print(f"  Best: {min(eval_losses):.4f} (at step {eval_steps[eval_losses.index(min(eval_losses))]})")
    print(f"  Improvement: {((eval_losses[0] - min(eval_losses)) / eval_losses[0] * 100):.2f}%")
if eval_wers and any(w > 0 for w in eval_wers):
    valid_wers = [w for w in eval_wers if w > 0]
    valid_steps = [eval_steps[i] for i, w in enumerate(eval_wers) if w > 0]
    print(f"\nWord Error Rate (WER):")
    print(f"  Initial: {valid_wers[0]:.4f}")
    print(f"  Final: {valid_wers[-1]:.4f}")
    print(f"  Best: {min(valid_wers):.4f} (at step {valid_steps[valid_wers.index(min(valid_wers))]})")
    print(f"  Improvement: {((valid_wers[0] - min(valid_wers)) / valid_wers[0] * 100):.2f}%")
if eval_cers and any(c > 0 for c in eval_cers):
    valid_cers = [c for c in eval_cers if c > 0]
    valid_steps = [eval_steps[i] for i, c in enumerate(eval_cers) if c > 0]
    print(f"\nCharacter Error Rate (CER):")
    print(f"  Initial: {valid_cers[0]:.4f}")
    print(f"  Final: {valid_cers[-1]:.4f}")
    print(f"  Best: {min(valid_cers):.4f} (at step {valid_steps[valid_cers.index(min(valid_cers))]})")
    print(f"  Improvement: {((valid_cers[0] - min(valid_cers)) / valid_cers[0] * 100):.2f}%")
print("="*70)


In [None]:
# Training and Validation Loss Plot (Epoch-based)
import matplotlib.pyplot as plt
import numpy as np

# Extract training history from trainer state
train_history = trainer.state.log_history

# Separate training and evaluation logs
train_logs = [log for log in train_history if 'loss' in log and 'eval_loss' not in log]
eval_logs = [log for log in train_history if 'eval_loss' in log]

# Extract epochs and losses
# Group training logs by epoch
train_by_epoch = {}
for log in train_logs:
    epoch = log.get('epoch', 0)
    if epoch not in train_by_epoch:
        train_by_epoch[epoch] = []
    train_by_epoch[epoch].append(log.get('loss', 0))

# Average losses per epoch
epochs = sorted(train_by_epoch.keys())
train_losses = [np.mean(train_by_epoch[epoch]) for epoch in epochs]

# Extract validation losses by epoch
eval_by_epoch = {}
for log in eval_logs:
    epoch = log.get('epoch', 0)
    if epoch not in eval_by_epoch:
        eval_by_epoch[epoch] = []
    eval_by_epoch[epoch].append(log.get('eval_loss', 0))

# Average validation losses per epoch
eval_epochs = sorted(eval_by_epoch.keys())
eval_losses = [np.mean(eval_by_epoch[epoch]) for epoch in eval_epochs]

# Create the plot matching the image style
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_losses, 'b-', linewidth=2, label='Training Loss', alpha=0.8)
if eval_epochs:
    plt.plot(eval_epochs, eval_losses, 'orange', linewidth=2, label='Validation Loss', alpha=0.8)

plt.xlabel('Epochs', fontsize=12, fontweight='bold')
plt.ylabel('Loss', fontsize=12, fontweight='bold')
plt.title('Training and Validation Loss', fontsize=14, fontweight='bold', pad=15)
plt.legend(fontsize=11, loc='best')
plt.grid(True, alpha=0.3, linestyle='--')
plt.tight_layout()
plt.show()

# Save the plot
plot_path = f"{OUTPUT_DIR}_loss_epochs.png"
plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')
print(f"Epoch-based loss plot saved to: {plot_path}")


## 11. Zip and Copy Model to Google Drive


In [None]:
import zipfile
from pathlib import Path

# Zip the final model directory (saved to Kaggle working directory)
zip_filename = f"/kaggle/working/{Path(final_model_path).name}.zip"
print(f"Creating zip file: {zip_filename}...")

with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as zipf:
    for file_path in Path(final_model_path).rglob("*"):
        if file_path.is_file():
            # Get relative path for archive
            arcname = file_path.relative_to(final_model_path)
            zipf.write(file_path, arcname)
            print(f"  Added: {arcname}")

print(f"\nZip file created: {zip_filename}")
print(f"File size: {Path(zip_filename).stat().st_size / (1024*1024):.2f} MB")
print(f"\nModel and zip file saved to Kaggle working directory (persistent)")


Creating zip file: wav2vec2_lora_amharic_legal_final.zip...
  Added: added_tokens.json
  Added: preprocessor_config.json
  Added: special_tokens_map.json
  Added: tokenizer_config.json
  Added: adapter_config.json
  Added: training_args.bin
  Added: adapter_model.safetensors
  Added: README.md
  Added: vocab.json

Zip file created: wav2vec2_lora_amharic_legal_final.zip
Model zip file copied to Google Drive: /content/drive/MyDrive/wav2vec2_lora_amharic_legal_final.zip

File size: 6.21 MB


## 12. Final Evaluation on Test Set


In [None]:
final_model_path = f"{OUTPUT_DIR}_final"
trainer.save_model(final_model_path)
processor.save_pretrained(final_model_path)

print(f"Final model saved to: {final_model_path}")
print("\nFiles saved:")
print("  - LoRA adapters (adapter_model.bin, adapter_config.json)")
print("  - Processor (tokenizer, feature_extractor)")
print("  - Training configuration")


Final model saved to: wav2vec2_lora_amharic_legal_final

Files saved:
  - LoRA adapters (adapter_model.bin, adapter_config.json)
  - Processor (tokenizer, feature_extractor)
  - Training configuration


## 13. Inference Example


In [None]:
def transcribe_audio(model, processor, audio_path):
    """Transcribe a single audio file"""
    speech, _ = librosa.load(audio_path, sr=16000)

    inputs = processor(
        speech,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

print("Inference function created")
print("\nExample usage:")
print("  transcription = transcribe_audio(model, processor, 'path/to/audio.mp3')")
print("  print(transcription)")


Inference function created

Example usage:
  transcription = transcribe_audio(model, processor, 'path/to/audio.mp3')
  print(transcription)


## 14. Load Model for Inference (After Training)


In [None]:
from peft import PeftModel

def load_trained_model(base_model_name, adapter_path, processor_path):
    """Load base model and LoRA adapters"""
    processor = Wav2Vec2Processor.from_pretrained(processor_path)

    base_model = Wav2Vec2ForCTC.from_pretrained(
        base_model_name,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer)
    )

    model = PeftModel.from_pretrained(base_model, adapter_path)

    if torch.cuda.is_available():
        model = model.to("cuda")

    model.eval()
    return model, processor

print("Model loading function created")
print("\nExample usage:")
print("  model, processor = load_trained_model(")
print("      MODEL_NAME,")
print(f"      '{OUTPUT_DIR}_final',")
print(f"      '{OUTPUT_DIR}_final'")
print("  )")


Model loading function created

Example usage:
  model, processor = load_trained_model(
      MODEL_NAME,
      'wav2vec2_lora_amharic_legal_final',
      'wav2vec2_lora_amharic_legal_final'
  )
