# Fine-Tune Whisper For Igbo ASR with 🤗 Transformers

## Prepare Environment

We can verify that we've been assigned a GPU and view its specifications:

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Mar 26 02:03:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load Dataset

In [5]:
from datasets import load_dataset, concatenate_datasets, DatasetDict, Audio

english_dataset = DatasetDict()
hausa_dataset = DatasetDict()
igbo_dataset = DatasetDict()
yoruba_dataset = DatasetDict()


english_dataset["train"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "english", split="test")
english_dataset["test"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "english", split="validation")

hausa_dataset["train"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "hausa", split="test")
hausa_dataset["test"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "hausa", split="validation")

igbo_dataset["train"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "igbo", split="test")
igbo_dataset["test"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "igbo", split="validation")

yoruba_dataset["train"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "yoruba", split="test")
yoruba_dataset["test"] = load_dataset("benjaminogbonna/nigerian_common_voice_dataset", "yoruba", split="validation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
english_dataset = english_dataset.remove_columns(["client_id", "accent", "locale", "path"])
hausa_dataset = hausa_dataset.remove_columns(["client_id", "accent", "locale", "path"])
igbo_dataset = igbo_dataset.remove_columns(["client_id", "accent", "locale", "path"])
yoruba_dataset = yoruba_dataset.remove_columns(["client_id", "accent", "locale", "path"])

In [22]:
print(igbo_dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'locale'],
        num_rows: 572
    })
    test: Dataset({
        features: ['audio', 'sentence', 'locale'],
        num_rows: 571
    })
})


### Prepare Data

Let's print the first example of the Common Voice dataset to see
what form the data is in:

In [23]:
print(igbo_dataset["train"][0])

{'audio': {'path': 'ng_voice_igbo_1003.mp3', 'array': array([ 0.00000000e+00,  4.36533405e-14,  1.22015137e-13, ...,
       -4.47112788e-03, -2.65926821e-03, -1.13113690e-03]), 'sampling_rate': 32000}, 'sentence': 'a hụ ndị ogbe ka ha fibere mgbe n’ama', 'locale': 'igbo'}


In [7]:
from datasets import Audio

english_dataset = english_dataset.cast_column("audio", Audio(sampling_rate=16000))
hausa_dataset = hausa_dataset.cast_column("audio", Audio(sampling_rate=16000))
igbo_dataset = igbo_dataset.cast_column("audio", Audio(sampling_rate=16000))
yoruba_dataset = yoruba_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [25]:
print(igbo_dataset["train"][0])

{'audio': {'path': 'ng_voice_igbo_1003.mp3', 'array': array([-5.00222086e-12, -9.09494702e-13, -9.09494702e-12, ...,
       -2.91874446e-03, -4.53312509e-03, -2.97277421e-03]), 'sampling_rate': 16000}, 'sentence': 'a hụ ndị ogbe ka ha fibere mgbe n’ama', 'locale': 'igbo'}


In [32]:
# Add a language column to each dataset

english_dataset = english_dataset.map(lambda x: {"language": "english"})
hausa_dataset = hausa_dataset.map(lambda x: {"language": "hausa"})
igbo_dataset = igbo_dataset.map(lambda x: {"language": "igbo"})
yoruba_dataset = yoruba_dataset.map(lambda x: {"language": "yoruba"})

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Map:   0%|          | 0/901 [00:00<?, ? examples/s]

Map:   0%|          | 0/901 [00:00<?, ? examples/s]

Map:   0%|          | 0/572 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/417 [00:00<?, ? examples/s]

In [8]:
# Combine the datasets

combined_train  = concatenate_datasets([english_dataset['train'], hausa_dataset['train'], igbo_dataset['train'], yoruba_dataset['train']])
combined_test  = concatenate_datasets([english_dataset['test'], hausa_dataset['test'], igbo_dataset['test'], yoruba_dataset['test']])

Re-loading the first audio sample in the Common Voice dataset will resample
it to the desired sampling rate:

In [9]:
print(combined_train[0])

{'audio': {'path': 'ng_voice_english_1000.mp3', 'array': array([-2.86213044e-08, -7.97070854e-08, -9.06553979e-08, ...,
        3.36406356e-06,  3.90603145e-06, -1.83695943e-06]), 'sampling_rate': 16000}, 'sentence': 'It is thirty five degrees with drizzle in Uyo'}


## Prepare Feature Extractor, Tokenizer and Data

In [10]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

### Load WhisperTokenizer

In [12]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

### Combine To Create A WhisperProcessor

In [13]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

In [23]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"],
                                truncation=True,
                                max_length=448,
                                padding="max_length",
                                return_attention_mask=True
                                ).input_ids

    return batch

In [24]:
# igbo_dataset = igbo_dataset.map(prepare_dataset, remove_columns=igbo_dataset.column_names["train"], num_proc=2)

# Apply preprocessing
# combined_dataset = combined_dataset.map(prepare_dataset, remove_columns=combined_dataset.column_names["train"], num_proc=2)

combined_train = combined_train.map(prepare_dataset, remove_columns=combined_train.column_names, num_proc=2)

Map (num_proc=2):   0%|          | 0/2232 [00:00<?, ? examples/s]

In [25]:
combined_test = combined_test.map(prepare_dataset, remove_columns=combined_test.column_names, num_proc=2)

Map (num_proc=2):   0%|          | 0/2229 [00:00<?, ? examples/s]

## Training and Evaluation

### Load a Pre-Trained Checkpoint

In [27]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

In [28]:
model.generation_config.language = "english"
model.generation_config.task = "transcribe"

# model.generation_config.forced_decoder_ids = None

### Define a Data Collator

In [29]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

Let's initialise the data collator we've just defined:

In [30]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

### Evaluation Metrics

We'll use the word error rate (WER) metric, the 'de-facto' metric for assessing
ASR systems. For more information, refer to the WER [docs](https://huggingface.co/metrics/wer). We'll load the WER metric from 🤗 Evaluate:

In [31]:
import evaluate

metric = evaluate.load("wer")

In [32]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

### Define the Training Configuration

In the final step, we define all the parameters related to training. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments).

In [33]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-for-nigerian-common-languages",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=100,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)



**Note**: if one does not want to upload the model checkpoints to the Hub,
set `push_to_hub=False`.

We can forward the training arguments to the 🤗 Trainer along with our model,
dataset, data collator and `compute_metrics` function:

In [34]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=combined_train,
    eval_dataset=combined_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


We'll save the processor object once before starting training. Since the processor is not trainable, it won't change over the course of training:

In [36]:
processor.save_pretrained(training_args.output_dir)

[]

### Training

```javascript
function ConnectButton(){
    console.log("Connect pushed");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
setInterval(ConnectButton, 60000);
```

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Wer
100,4.1555,0.237286,106.521739
200,0.1875,0.162103,101.665565
300,0.1453,0.133564,89.985948
400,0.1188,0.118646,92.72607
500,0.0927,0.109147,84.923128
600,0.0926,0.103155,80.810878


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
model.save_pretrained("./whisper-base-for-nigerian-common-languages")
# processor.save_pretrained("./whisper-small-igbo")

Our best WER is 32.0% - not bad for 8h of training data! We can make our model more accessible on the Hub with appropriate tags and README information.
You can change these values to match your dataset, language and model
name accordingly:

In [1]:
kwargs = {
    "dataset_tags": "benjaminogbonna/nigerian_common_voice_dataset",
    "dataset": "benjaminogbonna/nigerian_common_voice_dataset",
    "dataset_args": "config: english, split: train+test; config: hausa, split: train+test; config: igbo, split: train+test; config: yoruba, split: train+test",
    "language": ["english", "hausa", "igbo", "yoruba"],
    "model_name": "Whisper tiny for Nigerian Common Languages",
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
}

The training results can now be uploaded to the Hub. To do so, execute the `push_to_hub` command and save the preprocessor object we created:

In [2]:
trainer.push_to_hub(**kwargs)

NameError: name 'trainer' is not defined

In [None]:
# Inference

import torch
from transformers import pipeline

# Load the fine-tuned model and processor
model = WhisperForConditionalGeneration.from_pretrained("./whisper-small-enwazobia")
processor = WhisperProcessor.from_pretrained("./whisper-small-enwazobia")

# Create a pipeline for inference
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)

# Transcribe new audio
audio_input = "igbo_test1.mp3"
transcription = pipe(audio_input)
print(transcription)

Device set to use cuda:0


{'text': 'isi obodo steeti Delta'}


In [None]:
# Transcribe new audio
audio_input = "igbo_test2.mp3"
transcription = pipe(audio_input)
print(transcription['text'])



Odeakwụkwọ ọkpụtọrọkpụ ụlọọrụ na-ahụ maka ọrụ ngo na steeti Anambra


In [None]:
model_from_hub = WhisperForConditionalGeneration.from_pretrained("benjaminogbonna/whisper-small-enwazobia")
processor_from_hub = WhisperProcessor.from_pretrained("benjaminogbonna/whisper-small-enwazobia")

pipeline2 = pipeline("automatic-speech-recognition", model="benjaminogbonna/whisper-small-enwazobia")

audio_input = "igbo_test2.mp3"
transcription = pipeline2(audio_input)
print(transcription['text'])

Device set to use cuda:0


Odeakwụkwọ ọkpụtọrọkpụ ụlọọrụ na-ahụ maka ọrụ ngo na steeti Anambra


In [None]:
audio_input = "igbo_test1.mp3"
transcription = pipeline2(audio_input)
print(transcription['text'])

isi obodo steeti Delta




In [None]:
# Odeakwụkwọ ọkpụtọrọkpụ ụlọọrụ na-ahụ maka ọrụ ngo na steeti Anambra

## Building a Demo

In [None]:
from transformers import pipeline
import gradio as gr

pipe = pipeline(model="benjaminogbonna/whisper-small-enwazobia")  # change to "your-username/the-name-you-picked"

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small enwazobia",
    description="Realtime demo for English, Igbo, Hausa, Yoruba speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

Device set to use cuda:0


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://711d243001fe1dc404.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Another try

In [None]:
from datasets import Audio

# Resample audio to 16kHz (Whisper's expected input)
igbo_dataset = igbo_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
from transformers import WhisperProcessor

# Load the Whisper tiny processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="en", task="transcribe")

from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [None]:
def prepare_dataset(batch):
    # Load and resample audio
    audio = batch["audio"]

    # Compute log-Mel spectrogram input features
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # Tokenize the transcriptions
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids

    return batch

In [None]:
igbo_dataset = igbo_dataset.map(prepare_dataset, remove_columns=igbo_dataset.column_names["train"], num_proc=2)

Map (num_proc=2):   0%|          | 0/11 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/4 [00:00<?, ? examples/s]

## Load the Whisper Tiny Model for fine-tuning:

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

In [None]:
# Use the Trainer API from Hugging Face to fine-tune the model:

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define training arguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-igbo",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=3e-5,
    warmup_steps=100,
    max_steps=500,
    fp16=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_dir="./logs",
    logging_steps=100,
    report_to="tensorboard",
    save_total_limit=2,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)



In [None]:
# Define the Trainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=igbo_dataset["train"],
    eval_dataset=igbo_dataset["test"],
    tokenizer=processor.tokenizer,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Start training

trainer.train()

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['input_features', 'labels']

In [None]:
# After training, evaluate the model on the test set

import evaluate

# Load the WER metric
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Decode predictions and labels
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Evaluate
results = trainer.evaluate(eval_dataset, metric_key_prefix="eval")
print(results)

In [None]:
!pip install --upgrade --quiet datasets[audio]

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.

In [None]:
from datasets import Dataset, Audio, DatasetDict, load_dataset

In [None]:
common_voice = DatasetDict()

common_voice["train"] = load_dataset("benjaminogbonna/nigerian_accented_english_dataset", split="train+validation")
common_voice["test"] = load_dataset("benjaminogbonna/nigerian_accented_english_dataset", split="test")

print(common_voice)

README.md:   0%|          | 0.00/9.14k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/98.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/11.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2721 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/340 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/341 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'client_id', 'path', 'sentence', 'accent', 'locale', 'segment'],
        num_rows: 3061
    })
    test: Dataset({
        features: ['audio', 'client_id', 'path', 'sentence', 'accent', 'locale', 'segment'],
        num_rows: 341
    })
})


In [None]:
common_voice = common_voice.remove_columns(["accent", "client_id", "locale", "segment", 'path'])

In [None]:
print(common_voice["train"][1])

{'audio': {'path': 'audio_sample_100.mp3', 'array': array([-2.39627640e-04, -1.97531001e-04, -7.99428453e-05, ...,
        1.38503383e-05, -2.25158510e-06, -7.35291833e-05]), 'sampling_rate': 16000}, 'sentence': 'Head south on Ibo Road towards Emir Road'}


In [None]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [None]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

In [None]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [None]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/36 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

In [None]:
model.generation_config.language = "english"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
import evaluate

metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-ig2",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=1,
    max_steps=2,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1,
    eval_steps=1,
    logging_steps=1,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Wer
1,1.4444,1.539057,55.172414
2,1.5221,1.506303,45.977011


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=2, training_loss=1.483235478401184, metrics={'train_runtime': 201.2837, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.01, 'total_flos': 787804323840000.0, 'train_loss': 1.483235478401184, 'epoch': 0.6666666666666666})

In [None]:
model.save_pretrained("./whisper-tiny-ig2")
processor.save_pretrained("./whisper-tiny-ig2")

[]

In [None]:
# Inference

import torch
from transformers import pipeline

# Load the fine-tuned model and processor
model = WhisperForConditionalGeneration.from_pretrained("./whisper-tiny-ig2")
processor = WhisperProcessor.from_pretrained("./whisper-tiny-ig2")

# Create a pipeline for inference
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)

# Transcribe new audio
audio_input = "igbo_test2.mp3"
transcription = pipe(audio_input)
print(transcription)

Device set to use cpu


{'text': ' Thank you for watching.'}


In [None]:
# Inference

import torch
from transformers import pipeline

# Load the fine-tuned model and processor
model = WhisperForConditionalGeneration.from_pretrained("benjaminogbonna/whisper-tiny-igbo")
processor = WhisperProcessor.from_pretrained("benjaminogbonna/whisper-tiny-igbo")

# Create a pipeline for inference
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)

# Transcribe new audio
audio_input = "igbo_test2.mp3"
transcription = pipe(audio_input)
print(transcription)

Device set to use cpu


{'text': 'Odeakwụkwọ ọkpụtọrọkpụ ụlọọrụ na-ahụ maka ọrụ ngo na steeti Anambra'}
