# Fine-tuning Wav2Vec2 for Amharic Legal ASR with LoRA

This notebook fine-tunes `agkphysics/wav2vec2-large-xlsr-53-amharic` for legal-domain Amharic speech recognition, using LoRA (Low-Rank Adaptation) so that training stays efficient enough for the Colab free tier.

## Model and Approach
- **Base Model**: `agkphysics/wav2vec2-large-xlsr-53-amharic`
- **Fine-tuning Method**: LoRA (via PEFT library)
- **Task**: Automatic Speech Recognition (ASR)
- **Domain**: Legal Amharic text


## 1. Installation and Setup


In [None]:
%pip install -q transformers datasets accelerate peft torchaudio librosa jiwer soundfile


In [None]:
import os
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Union, Optional
import librosa
import soundfile as sf
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)

from datasets import Dataset, DatasetDict
import jiwer

# Report the PyTorch build and, when a GPU is visible, its name and memory size.
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {gpu_props.total_memory / 1e9:.2f} GB")


## 2. Configuration


In [None]:
# Pretrained checkpoint that is fine-tuned in this notebook.
MODEL_NAME = "agkphysics/wav2vec2-large-xlsr-53-amharic"

# Dataset locations. All paths live under /content/drive, so Google Drive must
# already be mounted there before the data-loading cells run.
AUDIO_DIR = "/content/drive/MyDrive/Dataset_1.5h/audio"
TRAIN_CSV = "/content/drive/MyDrive/Dataset_1.5h/train.csv"
VAL_CSV = "/content/drive/MyDrive/Dataset_1.5h/val.csv"
TEST_CSV = "/content/drive/MyDrive/Dataset_1.5h/test.csv"

# Directory used for checkpoints; the final model is saved to "<OUTPUT_DIR>_final".
OUTPUT_DIR = "wav2vec2_lora_amharic_legal"

# LoRA hyper-parameters, read key by key when the LoraConfig is built.
LORA_CONFIG = {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "v_proj", "k_proj", "out_proj"],
    "lora_dropout": 0.1,
    "bias": "none",
    # NOTE(review): the LoraConfig cell does not read this key; it is kept only
    # so that existing references to the dict keep working.
    "task_type": "FEATURE_EXTRACTION"
}

# Keyword arguments unpacked into transformers.TrainingArguments.
TRAINING_ARGS = {
    "output_dir": OUTPUT_DIR,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 3e-4,
    "warmup_steps": 500,
    "max_steps": 5000,
    "gradient_checkpointing": True,
    "fp16": True,
    "evaluation_strategy": "steps",
    "eval_steps": 500,
    "save_strategy": "steps",
    "save_steps": 500,
    "save_total_limit": 3,
    "load_best_model_at_end": True,
    "metric_for_best_model": "wer",
    "greater_is_better": False,
    # The model handed to the Trainer is PEFT-wrapped. Some transformers
    # versions infer the label columns from the wrapper's generic
    # forward(*args, **kwargs) signature, find none, and then skip
    # compute_metrics entirely, so "wer" would never be reported. Naming the
    # label column explicitly avoids depending on that inference.
    "label_names": ["labels"],
    "logging_steps": 100,
    "report_to": "none",
    "push_to_hub": False
}

print("Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  LoRA rank (r): {LORA_CONFIG['r']}")
print(f"  LoRA alpha: {LORA_CONFIG['lora_alpha']}")


## 3. Load and Prepare Data


In [None]:
def load_csv_split(csv_path, audio_dir):
    """Load one CSV split as a list of audio/transcription records.

    The CSV must provide a ``file_name`` column (audio file name relative to
    ``audio_dir``) and a ``transcription`` column.

    Args:
        csv_path: Path of the CSV file describing the split.
        audio_dir: Directory that contains the audio files.

    Returns:
        A list of dicts with the keys ``'audio_path'`` (str) and
        ``'transcription'`` (str, surrounding whitespace stripped). Rows whose
        audio file does not exist or whose transcription is missing are
        skipped with a printed warning.
    """
    df = pd.read_csv(csv_path)
    audio_root = Path(audio_dir)

    data = []
    # Iterating the two columns directly avoids building a Series per row.
    for file_name, raw_text in zip(df['file_name'], df['transcription']):
        audio_path = audio_root / file_name

        if not audio_path.exists():
            print(f"Warning: Audio file not found: {audio_path}")
            continue

        # An empty CSV cell is read as NaN; str(NaN) would silently turn it
        # into the bogus label "nan", so such rows are dropped instead.
        if pd.isna(raw_text):
            print(f"Warning: Missing transcription for: {audio_path}")
            continue

        data.append({
            'audio_path': str(audio_path),
            'transcription': str(raw_text).strip()
        })

    return data

# Read the three splits in train/val/test order; all of them share AUDIO_DIR.
train_data, val_data, test_data = (
    load_csv_split(csv_file, AUDIO_DIR)
    for csv_file in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

split_sizes = [len(train_data), len(val_data), len(test_data)]
print(f"Train samples: {split_sizes[0]}")
print(f"Validation samples: {split_sizes[1]}")
print(f"Test samples: {split_sizes[2]}")
print(f"\nTotal samples: {sum(split_sizes)}")


## 4. Create Vocabulary and Processor


In [None]:
def extract_vocab_from_transcriptions(data_list):
    """Return the sorted list of distinct characters found in the 'transcription' of every item."""
    return sorted({
        symbol
        for entry in data_list
        for symbol in entry['transcription']
    })

import json

# Collect the character inventory over all three splits so that every
# transcription can be encoded.
vocab = extract_vocab_from_transcriptions(train_data + val_data + test_data)

# The tokenizer is created with "|" as its word delimiter, which stands in for
# the space character. Space therefore must not get an id of its own, and a
# literal "|" in the data must not produce a second entry for the delimiter.
word_delimiter = "|"
characters = [ch for ch in vocab if ch not in (" ", word_delimiter)]
vocab_dict = {ch: idx for idx, ch in enumerate(characters)}

# Append the special tokens one at a time so each gets its own, contiguous id.
# (A chained assignment would give the delimiter and "<unk>" the same id.)
for special_token in (word_delimiter, "<unk>", "<pad>"):
    vocab_dict[special_token] = len(vocab_dict)

vocab_file = "vocab.json"
with open(vocab_file, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Amharic characters readable in the file.
    json.dump(vocab_dict, f, ensure_ascii=False, indent=2)

print(f"Vocabulary size: {len(vocab_dict)}")
print(f"First 20 characters: {list(vocab_dict.keys())[:20]}")


In [None]:
# CTC tokenizer backed by the vocabulary file; the special-token names passed
# here are the ones looked up in that file.
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file, unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|"
)

# Feature extractor for single-channel 16 kHz waveforms, with normalisation
# enabled, zero-valued padding and attention masks returned.
_feature_extractor_settings = {
    "feature_size": 1,
    "sampling_rate": 16000,
    "padding_value": 0.0,
    "do_normalize": True,
    "return_attention_mask": True,
}
feature_extractor = Wav2Vec2FeatureExtractor(**_feature_extractor_settings)

# Bundle both halves so audio and text go through a single object.
processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

print("Processor created successfully")


## 5. Load Model and Apply LoRA


In [None]:
# Load the pretrained checkpoint with a CTC head sized for the tokenizer built
# in this notebook.
model = Wav2Vec2ForCTC.from_pretrained(
    MODEL_NAME,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    # Overriding vocab_size changes the shape of the output layer relative to
    # the checkpoint. Without this flag the size mismatch aborts loading; with
    # it, the mismatching head is freshly initialised and everything else is
    # still loaded from the checkpoint.
    ignore_mismatched_sizes=True,
)

print(f"Base model loaded: {MODEL_NAME}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

if torch.cuda.is_available():
    model = model.to("cuda")
    print("Model moved to CUDA")


In [None]:
# No task_type is passed: the task-specific PEFT wrappers call the base model
# with text-model arguments such as input_ids, which Wav2Vec2ForCTC.forward
# does not accept. Without a task type PEFT uses its generic wrapper, which
# forwards the audio inputs unchanged.
lora_config = LoraConfig(
    r=LORA_CONFIG["r"],
    lora_alpha=LORA_CONFIG["lora_alpha"],
    target_modules=LORA_CONFIG["target_modules"],
    lora_dropout=LORA_CONFIG["lora_dropout"],
    bias=LORA_CONFIG["bias"],
    # The CTC output layer has to learn the notebook's own vocabulary, but LoRA
    # freezes every module that is not targeted. Listing it here keeps it
    # fully trainable and stores it together with the adapter weights.
    modules_to_save=["lm_head"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("\nLoRA adapters applied successfully")


## 6. Prepare Dataset


In [None]:
def speech_file_to_array_fn(path):
    """Read the audio file at `path` via librosa at a 16 kHz target rate and return only the samples."""
    waveform, _ = librosa.load(path, sr=16000)
    return waveform

def prepare_dataset(batch):
    """Convert a batch of audio paths and transcriptions into model features.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with the list-valued keys ``"audio_path"`` and
            ``"transcription"``.

    Returns:
        The same mapping with two added keys: ``"input_values"`` (one
        unpadded feature sequence per example) and ``"labels"`` (one unpadded
        list of token ids per example).
    """
    audio = [speech_file_to_array_fn(path) for path in batch["audio_path"]]

    # No padding and no tensor conversion here: every example keeps its own
    # length, and padding is left to the data collator at batch time. Padding
    # at this stage would bake pad values into the stored examples, where the
    # collator can no longer tell them apart from real data and mask them.
    batch["input_values"] = processor(audio, sampling_rate=16000).input_values

    # Encode the text with the tokenizer directly instead of the deprecated
    # processor.as_target_processor() context manager.
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids

    return batch

# Wrap the three record lists as Dataset objects first ...
raw_splits = [Dataset.from_list(records) for records in (train_data, val_data, test_data)]

# ... then featurise each one in chunks of 100, dropping the original columns
# so only what prepare_dataset produced remains.
train_dataset, val_dataset, test_dataset = (
    split.map(
        prepare_dataset,
        batched=True,
        batch_size=100,
        remove_columns=split.column_names,
    )
    for split in raw_splits
)

print("Datasets prepared:")
for split_label, split in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"  {split_label}: {len(split)} samples")


## 7. Data Collator


In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Pad audio features and label ids separately for one CTC training batch.

    Each feature dict must contain ``"input_values"`` and ``"labels"``. Label
    positions added by padding are replaced with -100 in the returned batch.
    """

    # Processor whose feature extractor pads the audio and whose tokenizer
    # pads the label ids.
    processor: Wav2Vec2Processor
    # Padding strategy forwarded to both pad calls.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio and labels have different lengths and pad values, so they are
        # split apart and padded independently.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # The tokenizer is called directly rather than through the deprecated
        # processor.as_target_processor() context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Mark padded label positions with -100 so they can be told apart from
        # real token ids downstream.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
print("Data collator created")


## 8. Evaluation Metrics


In [None]:
def compute_metrics(pred):
    """Compute WER (Word Error Rate) and CER (Character Error Rate).

    Args:
        pred: Evaluation output exposing ``predictions`` (logits) and
            ``label_ids`` (token ids with -100 at padded positions).

    Returns:
        Dict with the keys ``"wer"`` and ``"cer"``.
    """
    pad_id = processor.tokenizer.pad_token_id

    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # When the Trainer concatenates batches of different length it fills the
    # missing frames with -100 in every vocabulary slot. argmax over such a
    # frame returns id 0, a real character, so those frames are mapped to the
    # pad token, which is dropped when decoding.
    pred_ids[np.all(pred_logits == -100, axis=-1)] = pad_id

    # Build a cleaned copy instead of overwriting pred.label_ids in place.
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # Labels are plain text, not CTC output, so repeated ids must not be merged.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = jiwer.wer(label_str, pred_str)
    cer = jiwer.cer(label_str, pred_str)

    return {"wer": wer, "cer": cer}

print("Evaluation metrics function created")


## 9. Training Arguments and Trainer


In [None]:
import inspect

# transformers renamed TrainingArguments(evaluation_strategy=...) to
# eval_strategy=... and later dropped the old name. Translate the key only
# when the installed version no longer accepts it, so the shared TRAINING_ARGS
# dict works on both sides of the rename.
_training_kwargs = dict(TRAINING_ARGS)
_accepted_args = inspect.signature(TrainingArguments.__init__).parameters
if "evaluation_strategy" in _training_kwargs and "evaluation_strategy" not in _accepted_args:
    _training_kwargs["eval_strategy"] = _training_kwargs.pop("evaluation_strategy")

training_args = TrainingArguments(**_training_kwargs)

# Likewise, Trainer(tokenizer=...) was superseded by processing_class=...;
# use whichever keyword the installed Trainer declares.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_processor_keyword = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    **{_processor_keyword: processor.feature_extractor}
)

print("Trainer initialized")
print(f"\nTraining configuration:")
print(f"  Max steps: {training_args.max_steps}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Evaluation steps: {training_args.eval_steps}")


## 10. Train Model


In [None]:
# Run the training loop configured on the trainer.
train_result = trainer.train()

# Write the trained model and then the processor into the same directory.
final_model_path = OUTPUT_DIR + "_final"
for persist in (trainer.save_model, processor.save_pretrained):
    persist(final_model_path)

print(
    "\nTraining completed!",
    f"Training loss: {train_result.training_loss:.4f}",
    f"Final model saved to: {final_model_path}",
    sep="\n",
)


## 11. Zip and Copy Model to Google Drive


In [None]:
import shutil
import zipfile
from pathlib import Path

# Zip the final model directory
zip_filename = f"{final_model_path}.zip"
print(f"Creating zip file: {zip_filename}...")

model_dir = Path(final_model_path)
with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as zipf:
    for file_path in model_dir.rglob("*"):
        if file_path.is_file():
            # Store paths relative to the model directory so the archive
            # unpacks without the local directory prefix.
            arcname = file_path.relative_to(model_dir)
            zipf.write(file_path, arcname)
            print(f"  Added: {arcname}")

print(f"\nZip file created: {zip_filename}")

# Copy to Google Drive. Only the archive's file name is appended to the Drive
# folder, so the destination stays valid even when the output directory
# contains a directory component or is an absolute path.
drive_dest = f"/content/drive/MyDrive/{Path(zip_filename).name}"
shutil.copy2(zip_filename, drive_dest)

print(f"Model zip file copied to Google Drive: {drive_dest}")
print(f"\nFile size: {Path(zip_filename).stat().st_size / (1024*1024):.2f} MB")


## 12. Final Evaluation on Test Set


In [None]:
# Score the held-out test split with the model currently held by the trainer.
# The "test" prefix keeps these metrics apart from the validation metrics
# reported during training.
test_metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

print("Test set results:")
for metric_name in ("test_loss", "test_wer", "test_cer"):
    if metric_name in test_metrics:
        print(f"  {metric_name}: {test_metrics[metric_name]:.4f}")

# Save the evaluated model together with the processor.
final_model_path = f"{OUTPUT_DIR}_final"
trainer.save_model(final_model_path)
processor.save_pretrained(final_model_path)

print(f"Final model saved to: {final_model_path}")
print("\nFiles saved:")
print("  - LoRA adapters (adapter_model.bin, adapter_config.json)")
print("  - Processor (tokenizer, feature_extractor)")
print("  - Training configuration")


## 13. Inference Example


In [None]:
def transcribe_audio(model, processor, audio_path):
    """Transcribe a single audio file with greedy (argmax) CTC decoding.

    Args:
        model: CTC model to run; it is used on whatever device it already
            lives on.
        processor: Processor used to featurise the audio and decode the ids.
        audio_path: Path of the audio file; it is loaded at 16 kHz.

    Returns:
        The decoded transcription as a string.
    """
    speech, _ = librosa.load(audio_path, sr=16000)

    inputs = processor(
        speech,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    )

    # Follow the model's device instead of assuming CUDA whenever it is
    # available, and keep the tensors in named variables: rebuilding `inputs`
    # as a plain dict would lose attribute access such as `.input_values`.
    device = next(model.parameters()).device
    input_values = inputs.input_values.to(device)
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Inference must run in eval mode (dropout off); the caller's mode is
    # restored afterwards so calling this between training runs is harmless.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_values=input_values, attention_mask=attention_mask).logits
    finally:
        model.train(was_training)

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

print("Inference function created")
print("\nExample usage:")
print("  transcription = transcribe_audio(model, processor, 'path/to/audio.mp3')")
print("  print(transcription)")


## 14. Load Model for Inference (After Training)


In [None]:
from peft import PeftModel

def load_trained_model(base_model_name, adapter_path, processor_path):
    """Load the base model, attach saved LoRA adapters and return it with its processor.

    Args:
        base_model_name: Name or path of the pretrained base checkpoint.
        adapter_path: Directory containing the saved PEFT adapter.
        processor_path: Directory containing the saved processor.

    Returns:
        A ``(model, processor)`` tuple; the model is in eval mode and has been
        moved to CUDA when CUDA is available.
    """
    processor = Wav2Vec2Processor.from_pretrained(processor_path)

    base_model = Wav2Vec2ForCTC.from_pretrained(
        base_model_name,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
        # The output layer is sized from the saved tokenizer and so can differ
        # in shape from the base checkpoint; without this flag that mismatch
        # aborts loading.
        ignore_mismatched_sizes=True,
    )

    model = PeftModel.from_pretrained(base_model, adapter_path)

    if torch.cuda.is_available():
        model = model.to("cuda")

    model.eval()
    return model, processor

print("Model loading function created")
print("\nExample usage:")
print("  model, processor = load_trained_model(")
print("      MODEL_NAME,")
print(f"      '{OUTPUT_DIR}_final',")
print(f"      '{OUTPUT_DIR}_final'")
print("  )")
