# Common Voice Dataset Training with Whisper

This notebook demonstrates how to train or fine-tune the Whisper model using the Common Voice dataset.

In [1]:
%pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio whisper

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting tensorboard
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets[audio]
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets[audio])
  Downloading multiproce

In [2]:
class Hyperparameters:
    def __init__(self):
        self.learning_rate = 0.001
        self.batch_size = 32
        self.num_epochs = 10
        self.dropout_rate = 0.1
        self.weight_decay = 0.0001
        self.max_audio_length = 160000  # Example for 10 seconds of audio at 16kHz
        self.sample_rate = 16000
        self.num_layers = 6
        self.hidden_size = 512
        self.num_attention_heads = 8

    def update(self, **kwargs):
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)

    def display(self):
        for key, value in self.__dict__.items():
            print(f"{key}: {value}")

In [3]:
import whisper
import torch
import torchaudio

class WhisperModel:
    def __init__(self, hyperparameters):
        self.hyperparameters = hyperparameters
        self.model_size = "base"  # tiny, base, small, medium, large
        self.model = None
        self.load_model()

    def load_model(self):
        # Load the Whisper model
        self.model = whisper.load_model(self.model_size)
        return self.model

    def process_audio(self, audio_input):
        # Process the audio input for transcription
        # audio_input can be a path or an audio array
        return audio_input

    def transcribe(self, audio_input):
        # Generate text output from the audio input
        if self.model is None:
            self.load_model()

        if hasattr(audio_input, 'save'):
            # It's a file object from the API
            temp_path = "/tmp/audio_file.wav"
            audio_input.save(temp_path)
            audio_input = temp_path

        if isinstance(audio_input, torch.Tensor):
            # Resample to 16kHz and convert to mono
            sample_rate = 16000
            if audio_input.dim() > 1:  # Convert to mono if stereo
                audio_input = torch.mean(audio_input, dim=0, keepdim=True)
            audio_input = torchaudio.transforms.Resample(orig_freq=audio_input.size(1), new_freq=sample_rate)(audio_input)
            audio_input = audio_input.squeeze().numpy()

        result = self.model.transcribe(audio_input)
        return result['text']

    def tune_hyperparameters(self, new_hyperparameters):
        # Update hyperparameters and reload model if necessary
        self.hyperparameters.update(**new_hyperparameters)
        self.model = self.load_model()

In [4]:
import os
import pandas as pd
from tqdm import tqdm
import torch
from torch.utils.data import Dataset

class CommonVoiceDataset(Dataset):
    """Common Voice dataset loader.

    This dataset loader handles the Common Voice format with TSV files
    and audio clips stored in the 'clips' directory.
    """

    def __init__(self, root_dir, tsv_file, transform=None):
        """
        Args:
            root_dir (str): Directory containing the 'clips' folder
            tsv_file (str): Path to the TSV file with metadata
            transform (callable, optional): Optional transform to be applied on audio
        """
        self.root_dir = root_dir
        self.clips_dir = os.path.join(root_dir, 'clips')
        self.data = pd.read_csv(tsv_file, sep='\t')
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get the file path and sentence
        file_name = self.data.iloc[idx]['path']
        sentence = self.data.iloc[idx]['sentence']

        # Load audio file
        audio_path = os.path.join(self.clips_dir, file_name)

        # You can use your existing audio loading utility here
        from src.utils.audio import load_audio
        audio, sample_rate = load_audio(audio_path)

        sample = {'audio': audio, 'text': sentence, 'path': audio_path}

        if self.transform:
            sample = self.transform(sample)

        return sample

def load_common_voice_dataset(root_dir, split='train'):
    """
    Load Common Voice dataset for a specific split

    Args:
        root_dir (str): Base directory containing Common Voice data
        split (str): One of 'train', 'dev', 'test'

    Returns:
        CommonVoiceDataset: Dataset instance for the specified split
    """
    tsv_file = os.path.join(root_dir, f'{split}.tsv')
    return CommonVoiceDataset(root_dir, tsv_file)

def process_common_voice_metadata(tsv_file, min_duration=1, max_duration=10):
    """
    Process Common Voice metadata and filter by duration

    Args:
        tsv_file (str): Path to TSV file
        min_duration (float): Minimum audio duration in seconds
        max_duration (float): Maximum audio duration in seconds

    Returns:
        pd.DataFrame: Filtered dataframe
    """
    df = pd.read_csv(tsv_file, sep='\t')

    # Filter by duration if 'duration' column exists
    if 'duration' in df.columns:
        df = df[(df['duration'] >= min_duration) & (df['duration'] <= max_duration)]

    return df


In [6]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict

# Log into HuggingFace

In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Import datasets

In [6]:
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "es",
    split="train",
    streaming=True
)

common_voice["validation"] = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "es",
    split="validation",
    streaming=True
)

common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "es",
    split="test",
    streaming=True
)

print(common_voice)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

common_voice_11_0.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

The repository for mozilla-foundation/common_voice_11_0 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/mozilla-foundation/common_voice_11_0.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
DatasetDict({
    train: IterableDataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_shards: 6
    })
    validation: IterableDataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_shards: 1
    })
    test: IterableDataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_shards: 1
    })
})


In [7]:
from datasets import concatenate_datasets
common_voice['train_full'] = concatenate_datasets([common_voice['train'], common_voice['validation']])

# Preprocess data

In [8]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Spanish", task="transcribe")

# Get the first element using iteration
first_element = next(iter(common_voice["train_full"]))
print(first_element)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Reading metadata...: 230467it [00:13, 17490.35it/s]


{'client_id': '34719bb7c7344da7733b85c9d7215d24326093f1a2cd3a445bdc6dfe9ec4a8c9fe9729a73f6c29764545276bff81ffa65d3944f6da7a3ee3c06d0eb124fac797', 'path': 'es_train_0/common_voice_es_18338585.mp3', 'audio': {'path': 'es_train_0/common_voice_es_18338585.mp3', 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -3.25855126e-06, -3.52389725e-06, -3.05285812e-06]), 'sampling_rate': 48000}, 'sentence': '¿ Qué tal a tres de cinco ?', 'up_votes': 2, 'down_votes': 1, 'age': '', 'gender': '', 'accent': '', 'locale': 'es', 'segment': ''}


In [9]:
from datasets import Audio
columns_to_remove = ['accent', 'age', 'client_id', 'down_votes', 'gender', 'locale', 'path', 'segment', 'up_votes']

for split in common_voice:
    common_voice[split] = common_voice[split].remove_columns(columns_to_remove)

for split in common_voice:
    common_voice[split] = common_voice[split].cast_column("audio", Audio(sampling_rate=16000))

first_sample = next(iter(common_voice["train"]))
print(first_sample)

Reading metadata...: 230467it [00:05, 41321.52it/s]


{'audio': {'path': None, 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        2.06304139e-06, -1.93879569e-05, -3.51465860e-05]), 'sampling_rate': 16000}, 'sentence': '¿ Qué tal a tres de cinco ?'}


In [10]:
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small", language="Spanish", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Spanish", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Spanish", task="transcribe")

def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [11]:
columns_to_remove = common_voice["train_full"].column_names

for split in common_voice:
    # 1) map your preprocessing
    common_voice[split] = common_voice[split].map(
        prepare_dataset
    )
    # 2) drop the unwanted columns
    common_voice[split] = common_voice[split].remove_columns(
        columns_to_remove
    )


# Training

In [12]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.generation_config.language = "es"
model.generation_config.task = "transcribe"
model.generation_config.force_decoder_ids = None

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [13]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [14]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [15]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [19]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-hi",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=100,
    gradient_checkpointing=True,
    fp16=True,
    # evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    # load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)


In [20]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  trainer = Seq2SeqTrainer(


In [21]:
trainer.train()

Reading metadata...: 230467it [00:05, 40673.88it/s]


Step,Training Loss
25,0.3553
50,0.3295
75,0.2732
100,0.2941




TrainOutput(global_step=100, training_loss=0.31301586151123045, metrics={'train_runtime': 311.179, 'train_samples_per_second': 5.142, 'train_steps_per_second': 0.321, 'total_flos': 4.61736640512e+17, 'train_loss': 0.31301586151123045, 'epoch': 1.0})

# Upload to HuggingFace so it can be reused

In [30]:
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_11_0",
    "dataset": "Common Voice 11.0",
    "dataset_args": "",
    "language": "es",
    "model_name": "VoxLens - OpenAI Whisper Small Spanish",
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
}

# Save & push model
trainer.push_to_hub(
    "brauliodev/voxlens",
    **kwargs,
)

# Save & push processor
processor.push_to_hub(
    "brauliodev/voxlens"
)


No files have been modified since last commit. Skipping to prevent empty commit.


README.md:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/brauliodev/voxlens/commit/fda1e80c2f2a89eb6675007c160fb9c5c46c02ca', commit_message='Upload processor', commit_description='', oid='fda1e80c2f2a89eb6675007c160fb9c5c46c02ca', pr_url=None, repo_url=RepoUrl('https://huggingface.co/brauliodev/voxlens', endpoint='https://huggingface.co', repo_type='model', repo_id='brauliodev/voxlens'), pr_revision=None, pr_num=None)

# Test the model

In [31]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

model = WhisperForConditionalGeneration.from_pretrained("brauliodev/voxlens")
processor = WhisperProcessor.from_pretrained("brauliodev/voxlens")

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [45]:
from transformers import pipeline
import gradio as gr

model.generation_config.forced_decoder_ids = None

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    processor=processor,
    tokenizer=processor.tokenizer, # Explicitly pass the tokenizer
    feature_extractor=processor.feature_extractor, # Explicitly pass feature extractor
    device=0,
    chunk_length_s=30,
)

def transcribe(audio_filepath):
    result = pipe(audio_filepath)
    return result["text"]

Device set to use cuda:0


In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
print(transcribe("/content/drive/MyDrive/test.wav"))

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensL

Mi nombre es Renata.Soy estudiante en una universidad en México.


In [47]:
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        label="Upload an audio file",
        type="filepath"
    ),
    outputs=gr.Textbox(
        label="Transcription"
    ),
    title="VoxLens Whisper Small (Spanish)",
    description="Fine-tuned on Common Voice 11.0 (es). Upload a clip to transcribe.",
)

iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dfcd89a4b18bb644f6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


