# Fine-tuning Wav2Vec2 for Amharic Legal ASR with LoRA

This notebook fine-tunes `agkphysics/wav2vec2-large-xlsr-53-amharic` for legal domain Amharic speech recognition using LoRA (Low-Rank Adaptation) for efficient training on Colab free tier.

## Model and Approach
- **Base Model**: `agkphysics/wav2vec2-large-xlsr-53-amharic`
- **Fine-tuning Method**: LoRA (via PEFT library)
- **Task**: Automatic Speech Recognition (ASR)
- **Domain**: Legal Amharic text


## 1. Installation and Setup


In [2]:
%pip install -q transformers datasets accelerate peft torchaudio librosa jiwer soundfile


In [23]:
import os
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Union, Optional
import librosa
import soundfile as sf
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)

from datasets import Dataset, DatasetDict
import jiwer

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


PyTorch version: 2.9.0+cu126
CUDA available: True
CUDA device: Tesla T4
CUDA memory: 15.83 GB


In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. Configuration


In [25]:
MODEL_NAME = "agkphysics/wav2vec2-large-xlsr-53-amharic"

AUDIO_DIR = "/content/drive/MyDrive/Dataset_4.0h/audio"
TRAIN_CSV = "/content/drive/MyDrive/Dataset_4.0h/train.csv"
VAL_CSV = "/content/drive/MyDrive/Dataset_4.0h/val.csv"
TEST_CSV = "/content/drive/MyDrive/Dataset_4.0h/test.csv"

OUTPUT_DIR = "wav2vec2_lora_amharic_legal_v2"

LORA_CONFIG = {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "v_proj", "k_proj", "out_proj"],
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "FEATURE_EXTRACTION"
}

TRAINING_ARGS = {
    "output_dir": OUTPUT_DIR,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-4,  # Lowered from 3e-4 for more stable training
    "warmup_steps": 500,
    "max_steps": 2000,  # Increased steps for larger dataset
    "gradient_checkpointing": True,
    "fp16": True,
    "eval_strategy": "steps",
    "eval_steps": 400,  # You might want to reduce this too, e.g., 300 or 200
    "save_strategy": "steps",
    "save_steps": 400,  # You might want to reduce this too, e.g., 300 or 200
    "save_total_limit": 3,
    "load_best_model_at_end": True,
    "metric_for_best_model": "wer",
    "greater_is_better": False,
    "logging_steps": 100,
    "report_to": "none",
    "push_to_hub": False
}

print("Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  LoRA rank (r): {LORA_CONFIG['r']}")
print(f"  LoRA alpha: {LORA_CONFIG['lora_alpha']}")


Configuration:
  Model: agkphysics/wav2vec2-large-xlsr-53-amharic
  Output directory: wav2vec2_lora_amharic_legal_v2
  LoRA rank (r): 8
  LoRA alpha: 32


## 3. Load and Prepare Data


In [26]:
def load_csv_split(csv_path, audio_dir):
    """Load a CSV split and return list of (audio_path, transcription) tuples"""
    df = pd.read_csv(csv_path)

    data = []
    for _, row in df.iterrows():
        audio_path = Path(audio_dir) / row['file_name']
        transcription = str(row['transcription']).strip()

        if audio_path.exists():
            data.append({
                'audio_path': str(audio_path),
                'transcription': transcription
            })
        else:
            print(f"Warning: Audio file not found: {audio_path}")

    return data

train_data = load_csv_split(TRAIN_CSV, AUDIO_DIR)
val_data = load_csv_split(VAL_CSV, AUDIO_DIR)
test_data = load_csv_split(TEST_CSV, AUDIO_DIR)

print(f"Train samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")
print(f"\nTotal samples: {len(train_data) + len(val_data) + len(test_data)}")


Train samples: 302
Validation samples: 37
Test samples: 39

Total samples: 378


## 4. Create Vocabulary and Processor


In [27]:
# Use the original model's processor instead of creating a new vocabulary
# This preserves the pre-trained CTC head weights
print("Loading original model processor to preserve vocabulary and CTC head...")
original_processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
print(f"Original vocabulary size: {len(original_processor.tokenizer)}")
print(f"First 20 characters: {list(original_processor.tokenizer.get_vocab().keys())[:20]}")


Vocabulary size: 170
First 20 characters: [' ', 'ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ', 'ለ', 'ሉ', 'ሊ', 'ላ', 'ሌ', 'ል', 'ሎ', 'መ', 'ሙ', 'ሚ', 'ማ', 'ሜ']


In [28]:
# Use the original processor to maintain vocabulary compatibility
processor = original_processor
print("Using original model processor to preserve CTC head weights")


Processor created successfully


## 5. Load Model and Apply LoRA


In [29]:
# Load model WITHOUT reinitializing CTC head - use original vocabulary
model = Wav2Vec2ForCTC.from_pretrained(
    MODEL_NAME,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id
    # No vocab_size or ignore_mismatched_sizes - preserves original CTC head
)

print(f"Base model loaded: {MODEL_NAME}")
print(f"Vocabulary size: {len(processor.tokenizer)}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

# Ensure CTC head (lm_head) is trainable
for param in model.lm_head.parameters():
    param.requires_grad = True
print(f"CTC head (lm_head) parameters: {sum(p.numel() for p in model.lm_head.parameters()) / 1e6:.2f}M")
print(f"CTC head trainable: {all(p.requires_grad for p in model.lm_head.parameters())}")

if torch.cuda.is_available():
    model = model.to("cuda")
    print("Model moved to CUDA")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at agkphysics/wav2vec2-large-xlsr-53-amharic and are newly initialized because the shapes did not match:
- lm_head.bias: found shape torch.Size([234]) in the checkpoint and torch.Size([172]) in the model instantiated
- lm_head.weight: found shape torch.Size([234, 1024]) in the checkpoint and torch.Size([172, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded: agkphysics/wav2vec2-large-xlsr-53-amharic
Vocabulary size: 172
Model parameters: 315.62M
Model moved to CUDA


In [30]:
lora_config = LoraConfig(
    r=LORA_CONFIG["r"],
    lora_alpha=LORA_CONFIG["lora_alpha"],
    target_modules=LORA_CONFIG["target_modules"],
    lora_dropout=LORA_CONFIG["lora_dropout"],
    bias=LORA_CONFIG["bias"],
    task_type=TaskType.FEATURE_EXTRACTION
)

model = get_peft_model(model, lora_config)

# Ensure CTC head remains trainable after LoRA application
for param in model.base_model.lm_head.parameters():
    param.requires_grad = True

model.print_trainable_parameters()

# Count trainable parameters including CTC head
lm_head_params = sum(p.numel() for p in model.base_model.lm_head.parameters() if p.requires_grad)
print(f"\nCTC head (lm_head) trainable parameters: {lm_head_params / 1e6:.2f}M")

print("\nLoRA adapters applied successfully")

# Wrap forward method to filter out input_ids and inputs_embeds (PEFT adds them but Wav2Vec2ForCTC doesn't need them)
original_base_forward = model.base_model.forward

def filtered_base_forward(*args, **kwargs):
    # Remove input_ids and inputs_embeds if present (PEFT adds them but Wav2Vec2ForCTC doesn't need them)
    # Wav2Vec2ForCTC only expects input_values, not input_ids or inputs_embeds
    if 'input_ids' in kwargs:
        del kwargs['input_ids']
    if 'inputs_embeds' in kwargs:
        del kwargs['inputs_embeds']
    return original_base_forward(*args, **kwargs)

model.base_model.forward = filtered_base_forward
print("Model base forward method wrapped to filter input_ids and inputs_embeds")

# RIGHT AFTER applying LoRA and before creating Trainer
# Solution 1: Patch PEFT's save function
from peft.utils.save_and_load import get_peft_model_state_dict
import peft.utils.save_and_load

_original_get_peft_state_dict = get_peft_model_state_dict

def safe_get_peft_state_dict(model, state_dict=None, adapter_name="default",
                              unwrap_compiled=False, save_embedding_layers=False, **kwargs):
    """Skip embedding layers for Wav2Vec2 models"""
    if hasattr(model, 'base_model'):
        base_class = model.base_model.__class__.__name__
        if 'Wav2Vec2' in base_class:
            save_embedding_layers = False
    elif 'Wav2Vec2' in model.__class__.__name__:
        save_embedding_layers = False

    return _original_get_peft_state_dict(
        model, state_dict=state_dict, adapter_name=adapter_name,
        unwrap_compiled=unwrap_compiled,
        save_embedding_layers=save_embedding_layers, **kwargs
    )

peft.utils.save_and_load.get_peft_model_state_dict = safe_get_peft_state_dict

# Solution 2: Patch the Wav2Vec2Model class
from transformers import Wav2Vec2Model
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2ForCTC

Wav2Vec2Model.get_input_embeddings = lambda self: None
Wav2Vec2Model.get_output_embeddings = lambda self: None
Wav2Vec2Model.set_input_embeddings = lambda self, x: None
Wav2Vec2Model.set_output_embeddings = lambda self, x: None

Wav2Vec2ForCTC.get_input_embeddings = lambda self: None
Wav2Vec2ForCTC.get_output_embeddings = lambda self: (self.lm_head if hasattr(self, 'lm_head') else None)
Wav2Vec2ForCTC.set_input_embeddings = lambda self, x: None
Wav2Vec2ForCTC.set_output_embeddings = lambda self, x: (setattr(self, 'lm_head', x) if hasattr(self, 'lm_head') else None)

print("Applied comprehensive PEFT compatibility patches for Wav2Vec2")

trainable params: 1,572,864 || all params: 317,187,884 || trainable%: 0.4959

LoRA adapters applied successfully
Model base forward method wrapped to filter input_ids and inputs_embeds
Applied comprehensive PEFT compatibility patches for Wav2Vec2


## 6. Prepare Dataset


In [31]:
def speech_file_to_array_fn(path):
    """Load audio file and resample to 16kHz"""
    speech_array, sampling_rate = librosa.load(path, sr=16000)
    return speech_array

def prepare_dataset(batch):
    """Process a batch of audio and transcriptions"""
    audio = [speech_file_to_array_fn(path) for path in batch["audio_path"]]

    # Process audio - return as lists (no return_tensors)
    audio_features = processor.feature_extractor(
        audio,
        sampling_rate=16000
        # No padding, no return_tensors here - collator handles it
    )
    # Only store input_values, not attention_mask
    batch["input_values"] = audio_features.input_values  # This will be a list of arrays

    # Process text using tokenizer directly - return as lists
    batch["labels"] = [tokenizer(transcription, add_special_tokens=False)["input_ids"] for transcription in batch["transcription"]]

    # Make sure we're not accidentally storing attention_mask
    if "attention_mask" in batch:
        del batch["attention_mask"]

    return batch

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=train_dataset.column_names,
    batch_size=100,
    batched=True
)

val_dataset = val_dataset.map(
    prepare_dataset,
    remove_columns=val_dataset.column_names,
    batch_size=100,
    batched=True
)

test_dataset = test_dataset.map(
    prepare_dataset,
    remove_columns=test_dataset.column_names,
    batch_size=100,
    batched=True
)

print("Datasets prepared:")
print(f"  Train: {len(train_dataset)} samples")
print(f"  Validation: {len(val_dataset)} samples")
print(f"  Test: {len(test_dataset)} samples")


Map:   0%|          | 0/302 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Datasets prepared:
  Train: 302 samples
  Validation: 37 samples
  Test: 39 samples


## 7. Data Collator


In [32]:
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        import torch

        # Extract input_values and labels - explicitly ignore attention_mask if present
        input_values_list = [feature["input_values"] for feature in features]
        label_features = [feature["labels"] for feature in features]

        # Convert audio to tensors and pad manually
        input_values_tensors = []
        for iv in input_values_list:
            if isinstance(iv, torch.Tensor):
                input_values_tensors.append(iv)
            elif isinstance(iv, np.ndarray):
                input_values_tensors.append(torch.tensor(iv, dtype=torch.float32))
            else:
                input_values_tensors.append(torch.tensor(np.array(iv), dtype=torch.float32))

        # Pad audio sequences
        input_values = torch.nn.utils.rnn.pad_sequence(
            input_values_tensors,
            batch_first=True,
            padding_value=0.0
        )

        # Pad labels manually
        max_label_len = max(len(labels) for labels in label_features)
        pad_token_id = self.processor.tokenizer.pad_token_id

        padded_labels = []
        for labels in label_features:
            if isinstance(labels, torch.Tensor):
                labels = labels.tolist()

            padding_length = max_label_len - len(labels)
            padded = labels + [pad_token_id] * padding_length
            padded_labels.append(padded)

        labels_tensor = torch.tensor(padded_labels, dtype=torch.long)

        # Mask padded labels with -100
        attention_mask_labels = (labels_tensor != pad_token_id).long()
        labels_tensor = labels_tensor.masked_fill(attention_mask_labels.ne(1), -100)

        # Return ONLY input_values and labels - explicitly create a new dict
        batch = {}
        batch["input_values"] = input_values
        batch["labels"] = labels_tensor

        # Explicitly ensure no other keys are present
        return batch

# Add these lines:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
print("Data collator created")

Data collator created


## 8. Evaluation Metrics


In [33]:
def compute_metrics(pred):
    """Compute WER (Word Error Rate) and CER (Character Error Rate)"""
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = jiwer.wer(label_str, pred_str)
    cer = jiwer.cer(label_str, pred_str)

    return {"wer": wer, "cer": cer}

print("Evaluation metrics function created")


Evaluation metrics function created


## 9. Training Arguments and Trainer


In [34]:
training_args = TrainingArguments(**TRAINING_ARGS)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Trainer initialized")
print(f"\nTraining configuration:")
print(f"  Max steps: {training_args.max_steps}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Evaluation steps: {training_args.eval_steps}")


Trainer initialized

Training configuration:
  Max steps: 800
  Batch size: 4
  Gradient accumulation: 4
  Effective batch size: 16
  Learning rate: 0.0003
  Evaluation steps: 400


In [35]:
# Check if data collator is working correctly
sample_features = [train_dataset[0], train_dataset[1]]
sample_batch = data_collator(sample_features)
print("Batch keys:", list(sample_batch.keys()))
print("Batch has input_ids:", "input_ids" in sample_batch)
print("Batch has input_values:", "input_values" in sample_batch)
print("Batch has labels:", "labels" in sample_batch)

# Try to manually call the model to see what happens
model.eval()
with torch.no_grad():
    try:
        output = model(**sample_batch)
        print("Model forward works!")
    except Exception as e:
        print(f"Model forward error: {e}")

Batch keys: ['input_values', 'labels']
Batch has input_ids: False
Batch has input_values: True
Batch has labels: True
Model forward error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor


## 10. Train Model


In [36]:
train_result = trainer.train()

# Save final model
final_model_path = f"{OUTPUT_DIR}_final"
trainer.save_model(final_model_path)
processor.save_pretrained(final_model_path)

print("\nTraining completed!")
print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Final model saved to: {final_model_path}")


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Wer,Cer
400,6.1745,5.653153,1.0,0.968062
800,4.9072,4.723449,1.0,0.913849



Training completed!
Training loss: 11.4706
Final model saved to: wav2vec2_lora_amharic_legal_v2_final


## 11. Zip and Copy Model to Google Drive


In [42]:
import shutil
import zipfile
from pathlib import Path

# Zip the final model directory
zip_filename = f"{final_model_path}.zip"
print(f"Creating zip file: {zip_filename}...")

with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as zipf:
    for file_path in Path(final_model_path).rglob("*"):
        if file_path.is_file():
            # Get relative path for archive
            arcname = file_path.relative_to(final_model_path)
            zipf.write(file_path, arcname)
            print(f"  Added: {arcname}")

print(f"\nZip file created: {zip_filename}")

# Copy to Google Drive
drive_dest = f"/content/drive/MyDrive/{zip_filename}"
shutil.copy2(zip_filename, drive_dest)

print(f"Model zip file copied to Google Drive: {drive_dest}")
print(f"\nFile size: {Path(zip_filename).stat().st_size / (1024*1024):.2f} MB")


Creating zip file: wav2vec2_lora_amharic_legal_v2_final.zip...
  Added: added_tokens.json
  Added: preprocessor_config.json
  Added: special_tokens_map.json
  Added: tokenizer_config.json
  Added: adapter_config.json
  Added: training_args.bin
  Added: adapter_model.safetensors
  Added: README.md
  Added: vocab.json

Zip file created: wav2vec2_lora_amharic_legal_v2_final.zip
Model zip file copied to Google Drive: /content/drive/MyDrive/wav2vec2_lora_amharic_legal_v2_final.zip

File size: 6.21 MB


## 12. Final Evaluation on Test Set


In [38]:
final_model_path = f"{OUTPUT_DIR}_final"
trainer.save_model(final_model_path)
processor.save_pretrained(final_model_path)

print(f"Final model saved to: {final_model_path}")
print("\nFiles saved:")
print("  - LoRA adapters (adapter_model.bin, adapter_config.json)")
print("  - Processor (tokenizer, feature_extractor)")
print("  - Training configuration")


Final model saved to: wav2vec2_lora_amharic_legal_v2_final

Files saved:
  - LoRA adapters (adapter_model.bin, adapter_config.json)
  - Processor (tokenizer, feature_extractor)
  - Training configuration


## 13. Inference Example


In [39]:
def transcribe_audio(model, processor, audio_path):
    """Transcribe a single audio file"""
    speech, _ = librosa.load(audio_path, sr=16000)

    inputs = processor(
        speech,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

print("Inference function created")
print("\nExample usage:")
print("  transcription = transcribe_audio(model, processor, 'path/to/audio.mp3')")
print("  print(transcription)")


Inference function created

Example usage:
  transcription = transcribe_audio(model, processor, 'path/to/audio.mp3')
  print(transcription)


## 14. Load Model for Inference (After Training)


In [40]:
from peft import PeftModel

def load_trained_model(base_model_name, adapter_path, processor_path):
    """Load base model and LoRA adapters"""
    processor = Wav2Vec2Processor.from_pretrained(processor_path)

    base_model = Wav2Vec2ForCTC.from_pretrained(
        base_model_name,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer)
    )

    model = PeftModel.from_pretrained(base_model, adapter_path)

    if torch.cuda.is_available():
        model = model.to("cuda")

    model.eval()
    return model, processor

print("Model loading function created")
print("\nExample usage:")
print("  model, processor = load_trained_model(")
print("      MODEL_NAME,")
print(f"      '{OUTPUT_DIR}_final',")
print(f"      '{OUTPUT_DIR}_final'")
print("  )")


Model loading function created

Example usage:
  model, processor = load_trained_model(
      MODEL_NAME,
      'wav2vec2_lora_amharic_legal_v2_final',
      'wav2vec2_lora_amharic_legal_v2_final'
  )
