# Fine-Tune Whisper

Adapted from the Hugging Face guide: https://huggingface.co/blog/fine-tune-whisper

## Load Dataset

In [1]:
from datasets import load_from_disk

# Directory (relative to the notebook) that holds the saved DatasetDict.
dataset_dir = 'tt-asr-ds'
dataset = load_from_disk(dataset_dir)

# Bare trailing expression so the notebook displays the split/feature summary.
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
        num_rows: 379522
    })
    dev: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
        num_rows: 22324
    })
    test: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
        num_rows: 44649
    })
})

## Prepare Feature Extractor, Tokenizer and Data

In [2]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# One checkpoint name and one set of language/task options shared by all
# three loaders, so they cannot drift apart.
whisper_checkpoint = "openai/whisper-tiny"
transcribe_options = {"language": "bo", "task": "transcribe"}

feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_checkpoint)
tokenizer = WhisperTokenizer.from_pretrained(whisper_checkpoint, **transcribe_options)
processor = WhisperProcessor.from_pretrained(whisper_checkpoint, **transcribe_options)

2025-06-03 18:12:57.386889: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-03 18:12:57.489640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748988777.530377    5067 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748988777.543333    5067 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748988777.627615    5067 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

### Prepare Data

In [4]:
import librosa
import requests
import tempfile

def prepare_dataset(batch, timeout=30.0):
    """Download one example's audio and turn it into model inputs and labels.

    Args:
        batch: A single dataset example (dict-like). ``batch["url"]`` is the
            address the audio is downloaded from and ``batch["uni"]`` is the
            transcription text that gets tokenized.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            (seconds). Without a timeout a stalled connection would block the
            worker indefinitely.

    Returns:
        The same ``batch`` object with two keys added:
        ``"input_features"`` (the first item of the feature extractor's
        ``input_features``) and ``"labels"`` (the tokenizer's ``input_ids``
        for the transcription, truncated to at most 448 tokens).

    Raises:
        requests.HTTPError: If the download responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Audio is resampled to this rate on load, and the same rate is reported
    # to the feature extractor.
    sampling_rate = 16000

    # Download audio from URL; bounded by `timeout` so a dead host cannot
    # hang a map worker forever.
    response = requests.get(batch["url"], timeout=timeout)
    response.raise_for_status()

    # Save to a temporary WAV file so librosa can open it by path.
    # NOTE(review): re-opening a NamedTemporaryFile by name while it is still
    # open is not portable to Windows — confirm the target platform.
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        tmp.write(response.content)
        tmp.flush()  # make sure all bytes are on disk before librosa reads

        # Load and resample audio using librosa (returned rate is not needed).
        waveform, _ = librosa.load(tmp.name, sr=sampling_rate)

    # Feature extraction
    batch["input_features"] = feature_extractor(
        waveform, sampling_rate=sampling_rate
    ).input_features[0]

    # Tokenize transcription
    # NOTE(review): 448 presumably matches the Whisper decoder's maximum
    # label length — confirm against the model config.
    batch["labels"] = tokenizer(
        batch["uni"], max_length=448, truncation=True
    ).input_ids

    return batch



In [None]:
# Run prepare_dataset over every split with 4 worker processes, dropping the
# original columns (taken from the train split) from the result.
#
# The result is assigned back to `dataset`, so after one successful run the
# "url" column that prepare_dataset reads is gone. The guard makes re-running
# this cell a no-op instead of a KeyError inside the workers.
if "url" in dataset.column_names["train"]:
    dataset = dataset.map(
        prepare_dataset,
        remove_columns=dataset.column_names["train"],
        num_proc=4,
    )
else:
    print("Skipping preprocessing: no 'url' column found; dataset appears to be processed already.")

Map (num_proc=4):   6%|▌         | 20996/379522 [11:36<10:01:26,  9.94 examples/s]

In [None]:
# Persist the current `dataset` to disk under this directory name.
processed_dir = 'processed-ds'
dataset.save_to_disk(processed_dir)