# Fine-Tune Whisper

Adapted from the Hugging Face guide: https://huggingface.co/blog/fine-tune-whisper

## Load Dataset

In [1]:
from datasets import load_from_disk

# Load the dataset previously saved under the relative path 'amdo-ds'.
dataset = load_from_disk('amdo-ds')

# Bare last expression so the notebook displays the splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 12750
    })
    test: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
        num_rows: 2250
    })
})

## Prepare Feature Extractor, Tokenizer and Data

In [2]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# Single source of truth for the checkpoint and decoding settings, so the
# feature extractor, tokenizer and processor below cannot drift apart when
# one of these values is changed.
WHISPER_CHECKPOINT = "openai/whisper-tiny"
WHISPER_LANGUAGE = "bo"  # ISO 639-1 code for Tibetan
WHISPER_TASK = "transcribe"

# Turns raw audio samples into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)

# Turns transcription text into label token ids.
tokenizer = WhisperTokenizer.from_pretrained(
    WHISPER_CHECKPOINT, language=WHISPER_LANGUAGE, task=WHISPER_TASK
)

# Loaded with the same settings as the tokenizer; not used by the cells
# visible here (presumably needed by later training cells -- confirm).
processor = WhisperProcessor.from_pretrained(
    WHISPER_CHECKPOINT, language=WHISPER_LANGUAGE, task=WHISPER_TASK
)

### Prepare Data

In [3]:
import librosa
import requests
import tempfile

def prepare_dataset(batch, timeout=30):
    """Download one example's audio and turn it into Whisper training inputs.

    The function treats ``batch`` as a single example: ``batch["url"]`` is
    fetched as one URL and ``batch["uni"]`` is tokenized as one transcription.

    Args:
        batch: Mapping for one dataset example. Reads ``batch["url"]`` (where
            the audio is downloaded from) and ``batch["uni"]`` (the
            transcription text).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled server would block the calling worker
            indefinitely. Defaults to 30.

    Returns:
        The same ``batch`` object with two keys added: ``"input_features"``
        (first element of the feature extractor's ``input_features``) and
        ``"labels"`` (token ids of the transcription, truncated to 448).

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    # Download audio from URL; the timeout keeps a dead connection from
    # hanging the whole map job.
    response = requests.get(batch["url"], timeout=timeout)
    response.raise_for_status()

    # Save to a temporary WAV file so librosa can open it by path.
    # NOTE(review): reopening a NamedTemporaryFile by name while it is still
    # open works on Unix but not on Windows -- confirm the target platform.
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        tmp.write(response.content)
        # Flush so the bytes are on disk before librosa reads the path.
        tmp.flush()

        # Load and resample to 16 kHz. This must happen inside the `with`
        # block: the temporary file is deleted as soon as it is closed.
        waveform, sr = librosa.load(tmp.name, sr=16000)

    # Feature extraction; [0] takes the features of the single waveform passed in.
    batch["input_features"] = feature_extractor(
        waveform, sampling_rate=16000
    ).input_features[0]

    # Tokenize transcription, truncating to at most 448 token ids.
    batch["labels"] = tokenizer(
        batch["uni"], max_length=448, truncation=True
    ).input_ids

    return batch



In [4]:
# Run prepare_dataset over every example of each split using 6 worker
# processes, dropping the original columns so that only the keys
# prepare_dataset adds remain.
dataset = dataset.map(
    prepare_dataset,
    num_proc=6,
    remove_columns=dataset.column_names["train"],
)

Map (num_proc=6):   0%|          | 0/12750 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2250 [00:00<?, ? examples/s]

In [5]:
# Write the mapped dataset to the relative path 'processed-amdo-ds'
# (presumably so later steps can reload it instead of repeating the
# download and feature extraction -- confirm).
dataset.save_to_disk('processed-amdo-ds')

Saving the dataset (0/25 shards):   0%|          | 0/12750 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/2250 [00:00<?, ? examples/s]