In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00

In [2]:
import os
import torch
from datasets import load_dataset, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [3]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

In [5]:
# Define parameters
MODEL_ID = 'openai/whisper-small'
OUTPUT_DIR = 'whisper_nepali_model'
EPOCHS = 5
BATCH_SIZE = 4
LEARNING_RATE = 1e-5

In [6]:
# Initialize feature extractor, tokenizer, and processor
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_ID)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID, language='Nepali', task='transcribe')
processor = WhisperProcessor.from_pretrained(MODEL_ID, language='Nepali', task='transcribe')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [7]:
def load_datasets():
    """Load and preprocess the datasets."""
    train_dataset = load_dataset("fsicoli/common_voice_19_0", "ne-NP", split="train", trust_remote_code=True)
    val_dataset = load_dataset("fsicoli/common_voice_19_0", "ne-NP", split="test", trust_remote_code=True)

    # Resample to 16kHz
    train_dataset = train_dataset.cast_column('audio', Audio(sampling_rate=16000))
    val_dataset = val_dataset.cast_column('audio', Audio(sampling_rate=16000))

    # Remove unnecessary columns
    columns_to_remove = ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
    train_dataset = train_dataset.remove_columns(columns_to_remove)
    val_dataset = val_dataset.remove_columns(columns_to_remove)

    return train_dataset, val_dataset

In [8]:
def prepare_dataset(batch):
    """Prepare a single batch of the dataset."""
    audio = batch['audio']
    batch['input_features'] = feature_extractor(audio['array'], sampling_rate=audio['sampling_rate']).input_features[0]
    batch['labels'] = tokenizer(batch['sentence']).input_ids
    return batch

In [9]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [10]:
def compute_metrics(pred):
    """Compute WER for model evaluation."""
    metric = evaluate.load('wer')
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {'wer': wer}

In [11]:
def main():
    # Load and preprocess datasets
    train_dataset, val_dataset = load_datasets()

    train_dataset = train_dataset.map(prepare_dataset, num_proc=4)
    val_dataset = val_dataset.map(prepare_dataset, num_proc=4)

    # Initialize the model
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []

    # Prepare data collator
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(
        processor=processor,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # Set up training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        num_train_epochs=EPOCHS,
        evaluation_strategy='epoch',
        logging_strategy='epoch',
        save_strategy='epoch',
        predict_with_generate=True,
        generation_max_length=225,
        fp16=True,
        load_best_model_at_end=True,
        metric_for_best_model='wer',
        greater_is_better=False,
        report_to=['tensorboard'],
        dataloader_num_workers=4,
        save_total_limit=2,
    )

    # Initialize trainer
    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=processor.feature_extractor,
    )

    # Start training
    trainer.train()

    # Save the model
    trainer.save_model(OUTPUT_DIR)
    processor.save_pretrained(OUTPUT_DIR)


In [13]:
pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.10.0


In [12]:
main()

common_voice_19_0.py:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/138k [00:00<?, ?B/s]

n_shards.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

ne-NP_train_0.tar:   0%|          | 0.00/8.16M [00:00<?, ?B/s]

ne-NP_dev_0.tar:   0%|          | 0.00/3.84M [00:00<?, ?B/s]

ne-NP_test_0.tar:   0%|          | 0.00/5.44M [00:00<?, ?B/s]

ne-NP_other_0.tar:   0%|          | 0.00/16.6M [00:00<?, ?B/s]

ne-NP_invalidated_0.tar:   0%|          | 0.00/1.83M [00:00<?, ?B/s]

transcript/ne-NP/train.tsv:   0%|          | 0.00/134k [00:00<?, ?B/s]

transcript/ne-NP/dev.tsv:   0%|          | 0.00/50.2k [00:00<?, ?B/s]

transcript/ne-NP/test.tsv:   0%|          | 0.00/71.1k [00:00<?, ?B/s]

transcript/ne-NP/other.tsv:   0%|          | 0.00/223k [00:00<?, ?B/s]

transcript/ne-NP/invalidated.tsv:   0%|          | 0.00/23.8k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 381it [00:00, 26916.00it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 147it [00:00, 22897.56it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 205it [00:00, 26832.03it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 638it [00:00, 29457.04it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 67it [00:00, 64720.95it/s]


Map (num_proc=4):   0%|          | 0/381 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/205 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

ImportError: To be able to use evaluate-metric/wer, you need to install the following dependencies['jiwer'] using 'pip install jiwer' for instance'