## Necessary Libraries


In [1]:

!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio
!pip install jinja2

Collecting transformers
  Downloading transformers-4.45.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting datasets[audio]
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.

In [2]:
from datasets import load_dataset, DatasetDict
from transformers import (
    WhisperTokenizer,
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

In [3]:
from datasets import Audio
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch
import evaluate


## Defining Parameters

In [4]:
# Hub id of the pretrained checkpoint that is loaded for the processor and the model.
model_id = 'openai/whisper-small'
# Directory handed to the training arguments as `output_dir`.
# NOTE(review): the name says "tiny" but model_id is whisper-small — confirm which is intended.
out_dir = 'whisper_tiny_np'
# Number of training epochs (passed as `num_train_epochs`).
epochs = 50
# Per-device batch size used for both training and evaluation.
batch_size = 4

## Preprocessing the Dataset

In [5]:
# Audio front end: turns raw waveforms into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)

# Text side: tokenizer configured for Nepali transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    model_id,
    task='transcribe',
    language='Nepali',
)

# Processor bundling a feature extractor and tokenizer with the same settings;
# the data collator and the inference cell use it.
processor = WhisperProcessor.from_pretrained(
    model_id,
    task='transcribe',
    language='Nepali',
)


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [6]:
# Common Voice 19.0, Nepali ("ne-NP") configuration.
# NOTE(review): `val_np` is loaded from the "test" split and is later used as the
# trainer's eval set for best-model selection; the dataset also generates a
# "validation" split — confirm this choice is intentional.
train_np = load_dataset(
    "fsicoli/common_voice_19_0",
    "ne-NP",
    split="train",
    trust_remote_code=True,
)
val_np = load_dataset(
    "fsicoli/common_voice_19_0",
    "ne-NP",
    split="test",
    trust_remote_code=True,
)

common_voice_19_0.py:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/138k [00:00<?, ?B/s]

n_shards.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

ne-NP_train_0.tar:   0%|          | 0.00/8.16M [00:00<?, ?B/s]

ne-NP_dev_0.tar:   0%|          | 0.00/3.84M [00:00<?, ?B/s]

ne-NP_test_0.tar:   0%|          | 0.00/5.44M [00:00<?, ?B/s]

ne-NP_other_0.tar:   0%|          | 0.00/16.6M [00:00<?, ?B/s]

ne-NP_invalidated_0.tar:   0%|          | 0.00/1.83M [00:00<?, ?B/s]

transcript/ne-NP/train.tsv:   0%|          | 0.00/134k [00:00<?, ?B/s]

transcript/ne-NP/dev.tsv:   0%|          | 0.00/50.2k [00:00<?, ?B/s]

transcript/ne-NP/test.tsv:   0%|          | 0.00/71.1k [00:00<?, ?B/s]

transcript/ne-NP/other.tsv:   0%|          | 0.00/223k [00:00<?, ?B/s]

transcript/ne-NP/invalidated.tsv:   0%|          | 0.00/23.8k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 381it [00:00, 76523.00it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 147it [00:00, 88118.15it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 205it [00:00, 59673.28it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 638it [00:00, 79772.42it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 67it [00:00, 40662.48it/s]


In [7]:
train_np[0]

{'client_id': '9f8a47cee5574b287a8f93f5498d81115cf1dfbd718ead4f2265e4400f7de0f017a58a2c8c1245e0d3ceeccffa5b110322c4f784aa8a9785e3219557cb44395e',
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/2078d4f647abb87146c4e6361776aff17e038b4472a795fd02ab22d7c2574c59/ne-NP_train_0/common_voice_ne-NP_35314089.mp3',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/2078d4f647abb87146c4e6361776aff17e038b4472a795fd02ab22d7c2574c59/ne-NP_train_0/common_voice_ne-NP_35314089.mp3',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          7.00167766e-06, -4.02070254e-05, -3.65305859e-05]),
  'sampling_rate': 48000},
 'sentence': 'म पनि जान्छु है त अहिले लाई ।',
 'up_votes': 4,
 'down_votes': 0,
 'age': 'thirties',
 'gender': 'male_masculine',
 'accent': 'nepali',
 'locale': 'ne-NP',
 'segment': '',
 'variant': ''}

Resampling to 16 kHz

In [8]:
# The sample shown above reports a 48 kHz sampling rate; re-cast the audio column of
# both splits so it is decoded at 16 kHz instead.
train_np, val_np = (
    split.cast_column('audio', Audio(sampling_rate=16000))
    for split in (train_np, val_np)
)

In [9]:
# Drop the speaker/metadata columns from both splits; the audio and the transcript
# ("sentence") are what the preprocessing step reads.
train_np, val_np = (
    split.remove_columns(
        ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
    )
    for split in (train_np, val_np)
)

In [10]:
# Sanity check: print the index of every training example that lacks a transcript or audio.
for idx, example in enumerate(train_np):
  has_required_fields = 'sentence' in example and 'audio' in example
  if not has_required_fields:
    print(idx, 'not found')

In [11]:
def prepare_dataset(batch):
  """Add model inputs and target token ids to one dataset example.

  Despite the parameter name, this is mapped over the dataset one example at a time.
  Reads ``batch['audio']`` (its ``'array'`` and ``'sampling_rate'``) and
  ``batch['sentence']``; writes ``batch['input_features']`` and ``batch['labels']``.
  Relies on the module-level ``feature_extractor`` and ``tokenizer``.

  Returns the same mapping that was passed in, with the two keys added.
  """
  waveform = batch['audio']
  extracted = feature_extractor(waveform['array'], sampling_rate=waveform['sampling_rate'])
  # The extractor returns a batch; take its first entry.
  batch['input_features'] = extracted.input_features[0]

  encoded = tokenizer(batch['sentence'])
  batch['labels'] = encoded.input_ids
  return batch


# Run the preprocessing over every example of each split, in a single process.
train_np = train_np.map(prepare_dataset, num_proc=1)
val_np = val_np.map(prepare_dataset, num_proc=1)

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

In [12]:
train_np[0].keys(), val_np[0].keys()

(dict_keys(['audio', 'sentence', 'variant', 'input_features', 'labels']),
 dict_keys(['audio', 'sentence', 'variant', 'input_features', 'labels']))

In [13]:
# Round-trip check: encode the first training transcript and decode it again,
# with and without special tokens, to confirm the tokenizer reproduces the text.
input_str = train_np[0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
# True means encoding followed by decoding (special tokens stripped) is lossless for this sample.
print(f"Are equal:             {input_str == decoded_str}")


Input:                 म पनि जान्छु है त अहिले लाई ।
Decoded w/ special:    <|startoftranscript|><|ne|><|transcribe|><|notimestamps|>म पनि जान्छु है त अहिले लाई ।<|endoftext|>
Decoded w/out special: म पनि जान्छु है त अहिले लाई ।
Are equal:             True


### Preparing the Model


In [14]:
# Load the pretrained checkpoint that will be fine-tuned.
model = WhisperForConditionalGeneration.from_pretrained(model_id)

# Set the generation task and language on the model's generation config.
model.generation_config.task = 'transcribe'
model.generation_config.language = 'nepali'
# Clear any forced decoder ids on the generation config.
# NOTE(review): presumably so task/language above drive the decoder prompt — the
# training log still warns about a forced_decoder_ids conflict; confirm where it comes from.
model.generation_config.forced_decoder_ids = None

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [15]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into a padded batch of audio features and labels.

    ``processor`` must expose ``feature_extractor.pad`` and ``tokenizer.pad``.
    ``decoder_start_token_id`` is the token stripped from the front of the labels
    when every sequence in the batch begins with it.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and token labels are padded by different components,
        # so gather them into two separate lists first.
        audio_items = []
        token_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            token_items.append({"input_ids": example["labels"]})

        # The feature extractor turns the audio inputs into torch tensors.
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # The tokenizer pads the label sequences to the longest one in the batch.
        padded_labels = self.processor.tokenizer.pad(token_items, return_tensors="pt")

        # Padding positions become -100 so that the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If tokenization already prepended the start token to every sequence,
        # drop it here because it is added again later.
        all_start_with_bos = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if all_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [16]:
# Build the batch collator; the start-token id comes from the loaded model's config
# so the collator can strip that token from the front of the labels.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## Defining evaluation metrics

In [17]:
metric = evaluate.load('wer')

def compute_metrics(pred):
    """Compute the word error rate (in percent) for a set of predictions.

    Args:
        pred: object with ``predictions`` (predicted token ids) and ``label_ids``
            (reference token ids, with -100 marking ignored positions).

    Returns:
        dict with a single key ``'wer'`` holding 100 * WER.

    Relies on the module-level ``tokenizer`` and ``metric``.
    """
    pred_ids = pred.predictions
    # Work on a copy so the caller's label array keeps its -100 markers
    # (the original code overwrote them in place).
    label_ids = pred.label_ids.copy()

    # -100 is not a real token id; replace it with the pad token, which is
    # dropped again when decoding with skip_special_tokens=True.
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [18]:
# Training configuration: evaluate, log and checkpoint once per epoch, and reload the
# checkpoint with the lowest WER at the end of training.
training_args = Seq2SeqTrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    learning_rate=0.00001,
    # NOTE(review): warmup_steps is set, but lr_scheduler_type='constant' below does not
    # apply warmup; use 'constant_with_warmup' if the 500-step warmup is intended.
    warmup_steps=500,
    bf16=False,
    fp16=True,
    num_train_epochs=epochs,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    # Evaluate by generating text so that WER can be computed.
    predict_with_generate=True,
    generation_max_length=225,
    report_to=['tensorboard'],
    load_best_model_at_end=True,
    metric_for_best_model='wer',
    # Lower WER is better.
    greater_is_better=False,
    dataloader_num_workers=2,
    save_total_limit=2,
    lr_scheduler_type='constant',
    seed=42,
    data_seed=42
)



In [19]:
# Wire the model, datasets, collator and WER metric into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_np,
    eval_dataset=val_np,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): the feature extractor is passed in the `tokenizer` slot —
    # presumably so it is saved alongside checkpoints; confirm this is intended.
    tokenizer=processor.feature_extractor,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer.train()

  self.pid = os.fork()
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Wer
1,0.743,0.559718,72.915073
2,0.3074,0.511249,68.936496
3,0.1446,0.558583,69.013007
4,0.0828,0.555373,65.876052
5,0.0475,0.591383,67.482785
6,0.0327,0.660863,66.564652
7,0.021,0.67167,65.263963
8,0.0217,0.71982,66.029074
9,0.013,0.761301,67.482785


  self.pid = os.fork()
You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


In [None]:
# Load the OpenSLR Nepali ASR dataset ("cleaned" configuration, train split).
# NOTE(review): apart from the display cell that follows, no visible cell uses `dataset` —
# confirm whether this is leftover exploration.
dataset = load_dataset("spktsagar/openslr-nepali-asr-cleaned", name="cleaned", split='train')

In [None]:
dataset[0]

### For inference

In [None]:
from torch.utils.data import DataLoader
# Batch the eval split in groups of 4 using the same collator as training.
dataloader = DataLoader(val_np, batch_size=4, collate_fn=data_collator)

In [None]:
# NOTE(review): this rebinds `model` to the original pretrained checkpoint, so the
# inference below runs the base model rather than the fine-tuned one — confirm that a
# baseline comparison is what is intended here.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [None]:
# Iterate through batches and get model predictions
for batch in dataloader:
    input_features = batch["input_features"]
    labels = batch["labels"]

    # Perform inference (using no_grad for evaluation)
    with torch.no_grad():
        generated_ids = model.generate(input_features, language='ne')

    # Decode the predicted token IDs into text
    predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    sents = processor.batch_decode(labels, skip_special_tokens=True)
    # Print or store predictions
    for pred, sen in zip(predictions, sents):
        print(f'GT:{sen}.......... Pred: {pred}')
        
    break