In [None]:
pip install torch==2.6.0+cu124 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [None]:
pip install datasets==2.16.0

In [None]:
pip install transformers==4.52.0

In [None]:
pip install scikit-learn accelerate

In [None]:
pip install --upgrade evaluate jiwer

In [6]:
from huggingface_hub import notebook_login

# Show the Hugging Face Hub login widget. Authentication is needed later because
# the training arguments set push_to_hub=True and the model is pushed at the end.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
import evaluate
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from functools import partial

from datasets import load_dataset, DatasetDict
from datasets import Audio

from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

In [10]:
# MINDS-14 (en-US) only ships a "train" split, so carve a held-out "test" split
# out of it: the first 450 rows are used for training, the remainder for testing.
minds_14 = DatasetDict(
    {
        split_name: load_dataset("PolyAI/minds14", "en-US", split=split_slice)
        for split_name, split_slice in (
            ("train", "train[:450]"),
            ("test", "train[450:]"),
        )
    }
)

print(minds_14)

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})


In [12]:
# Keep only the audio and its transcription; the remaining features printed
# above (path, english_transcription, intent_class, lang_id) are dropped.
minds_14 = minds_14.select_columns(["audio", "transcription"])

In [14]:
# Load the whisper-tiny processor (it exposes both a feature extractor and a
# tokenizer, used below), configured for English transcription.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny", language="en", task="transcribe"
)

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [16]:
# Inspect the schema of the train split, in particular the audio sampling rate.
minds_14["train"].features

{'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None)}

In [17]:
# Re-declare the audio column with the sampling rate of the Whisper feature
# extractor, so decoded audio matches what the processor is configured for
# (the dataset itself reports 8000 Hz).
sampling_rate = processor.feature_extractor.sampling_rate
minds_14 = minds_14.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [19]:
def prepare_dataset(example):
    """Turn one raw dataset row into model inputs plus the clip duration.

    The module-level ``processor`` is called with the row's audio array, its
    sampling rate, and its ``transcription`` text. The clip duration in seconds
    is then stored on the processor output under ``input_length``.

    Args:
        example: A dataset row with an ``audio`` entry (providing ``array`` and
            ``sampling_rate``) and a ``transcription`` string.

    Returns:
        The processor output with an additional ``input_length`` key.
    """
    clip = example["audio"]
    waveform, rate = clip["array"], clip["sampling_rate"]

    processed = processor(
        audio=waveform,
        sampling_rate=rate,
        text=example["transcription"],
    )

    # duration in seconds = number of samples / samples per second
    processed["input_length"] = len(waveform) / rate

    return processed

In [20]:
# Run prepare_dataset over every split in a single process. The original
# columns are removed, so only the keys produced by prepare_dataset remain.
minds_14 = minds_14.map(
    prepare_dataset, remove_columns=minds_14.column_names["train"], num_proc=1
)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

In [24]:
# Upper bound (exclusive) on clip duration, in seconds.
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Return True when ``length`` (seconds) is strictly below ``max_input_length``."""
    return max_input_length > length

In [25]:
# Drop training clips that are 30 s or longer. Only the train split is
# filtered; the test split is left as is.
minds_14["train"] = minds_14["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)

Filter:   0%|          | 0/450 [00:00<?, ? examples/s]

In [26]:
# Show the train split after length filtering (columns and remaining row count).
minds_14["train"]

Dataset({
    features: ['input_features', 'labels', 'input_length'],
    num_rows: 445
})

In [27]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of tensors.

    Audio features and label token ids are padded separately because they have
    different lengths and need different padding methods: the processor's
    feature extractor pads the audio, its tokenizer pads the labels.
    """

    # Object exposing ``feature_extractor`` and ``tokenizer`` attributes, each
    # with a ``pad`` method; the tokenizer also provides ``bos_token_id``.
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of examples.

        Args:
            features: Examples carrying ``input_features`` (only the first
                element of each is used) and ``labels`` (token ids).

        Returns:
            The padded audio batch with an added ``labels`` tensor in which
            padded positions are set to -100.
        """
        tokenizer = self.processor.tokenizer

        # Audio side: take the first entry of each example's input_features and
        # let the feature extractor return torch tensors.
        audio_inputs = []
        for item in features:
            audio_inputs.append({"input_features": item["input_features"][0]})
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Label side: pad the tokenized transcriptions to the longest in the batch.
        token_inputs = []
        for item in features:
            token_inputs.append({"input_ids": item["labels"]})
        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; mark them
        # with -100 so they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence starts with the BOS token, drop that first column
        # since the token is added again later.
        # NOTE(review): for Whisper tokenizers the BOS id may differ from the
        # start-of-transcript token the labels begin with — confirm this check
        # ever fires.
        starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [28]:
# Collator instance handed to the trainer; it pads batches with the processor's
# feature extractor and tokenizer.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [29]:
# NOTE(review): `evaluate` is already imported in the imports cell near the top
# of the notebook; this second import is redundant but harmless.
import evaluate
# Word error rate metric, used by compute_metrics.
metric = evaluate.load("wer")

Downloading builder script: 0.00B [00:00, ?B/s]

In [30]:
# Text normalizer applied before computing the normalised WER.
normalizer = BasicTextNormalizer()


def compute_metrics(pred):
    """Compute orthographic and normalised word error rate for a prediction set.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).
            ``label_ids`` is modified in place: -100 entries are overwritten
            with the tokenizer's pad token id.

    Returns:
        Dict with ``wer_ortho`` (WER on the decoded strings as-is) and ``wer``
        (WER on normalised strings, skipping samples whose normalised reference
        is empty).
    """
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # -100 cannot be decoded; swap it for the pad token id before decoding
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # decode both sides without special tokens
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # orthographic WER: computed on the raw decoded text
    wer_ortho = metric.compute(predictions=pred_str, references=label_str)

    # normalised WER: normalise every prediction, then every reference
    pred_str_norm = [normalizer(hypothesis) for hypothesis in pred_str]
    label_str_norm = [normalizer(reference) for reference in label_str]

    # keep only the samples whose normalised reference is non-empty
    has_reference = [len(reference) > 0 for reference in label_str_norm]
    pred_str_norm = [
        hypothesis
        for idx, hypothesis in enumerate(pred_str_norm)
        if has_reference[idx]
    ]
    label_str_norm = [
        reference
        for reference, keep in zip(label_str_norm, has_reference)
        if keep
    ]

    wer = metric.compute(predictions=pred_str_norm, references=label_str_norm)

    return {"wer_ortho": wer_ortho, "wer": wer}

In [31]:
# Load the pretrained whisper-tiny checkpoint that is fine-tuned below.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [32]:
# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

# set language and task for generation and re-enable cache
model.generate = partial(
    model.generate, language="english", task="transcribe", use_cache=True
)

In [33]:
training_args = Seq2SeqTrainingArguments(
    # --- output, logging and Hub ---
    output_dir="./whisper-tiny-en",  # name on the HF Hub
    push_to_hub=True,
    report_to=["tensorboard"],
    logging_steps=25,
    # --- optimisation schedule ---
    max_steps=500,  # increase to 4000 if you have your own GPU or a Colab paid plan
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=50,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    # --- memory / precision ---
    gradient_checkpointing=True,
    fp16=True,
    fp16_full_eval=True,
    # --- evaluation (uses generate so WER can be computed) ---
    eval_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    # --- checkpointing: keep the checkpoint with the lowest WER ---
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

In [34]:
# Assemble the trainer: fine-tune on the length-filtered train split, evaluate
# on the held-out test split, batch with the padding collator and score with
# compute_metrics.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=minds_14["train"],
    eval_dataset=minds_14["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
)

In [35]:
# Run fine-tuning; evaluation and checkpointing follow the step intervals set
# in training_args.
trainer.train()

Step,Training Loss,Validation Loss,Wer Ortho,Wer
500,0.0006,0.664862,0.324491,0.324675


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=500, training_loss=0.2800298692956567, metrics={'train_runtime': 1228.3005, 'train_samples_per_second': 6.513, 'train_steps_per_second': 0.407, 'total_flos': 1.9569551781888e+17, 'train_loss': 0.2800298692956567, 'epoch': 17.857142857142858})

In [36]:
# Metadata passed along when the model is pushed to the Hub: the dataset used,
# the base checkpoint, and the task.
kwargs = dict(
    dataset_tags="PolyAI/minds14",
    finetuned_from="openai/whisper-tiny",
    tasks="automatic-speech-recognition",
)

In [37]:
# Upload the trained model to the Hub, forwarding the metadata in kwargs
# (presumably used to fill in the model card — verify against the Trainer docs).
trainer.push_to_hub(**kwargs)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...t/whisper-tiny-en/training_args.bin: 100%|##########| 5.43kB / 5.43kB            

  ...vents.1755607780.f508dd46cacf.290.0: 100%|##########| 11.6kB / 11.6kB            

  ...t/whisper-tiny-en/model.safetensors:  28%|##7       | 41.9MB /  151MB            

CommitInfo(commit_url='https://huggingface.co/arsonor/whisper-tiny-en/commit/1928743a3239b980dde1925f0504d1f0a35471e4', commit_message='End of training', commit_description='', oid='1928743a3239b980dde1925f0504d1f0a35471e4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/arsonor/whisper-tiny-en', endpoint='https://huggingface.co', repo_type='model', repo_id='arsonor/whisper-tiny-en'), pr_revision=None, pr_num=None)