## Necessary Libraries


In [1]:
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio
!pip install jinja2

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting datasets[audio]
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting ffmpy (from gradio)
  Down

In [2]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import (
    WhisperTokenizer,
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

In [3]:
from datasets import Audio
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch
import evaluate


## Defining Parameters

In [4]:
model_id = 'openai/whisper-small'
out_dir = 'whisper_tiny_np'
epochs = 30
batch_size = 4

## Preprocessing the Dataset

In [5]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
tokenizer = WhisperTokenizer.from_pretrained(model_id, language='Nepali', task='transcribe')
processor = WhisperProcessor.from_pretrained(model_id, language='Nepali', task='transcribe')


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [6]:
# train_np = load_dataset("Naruto1/ASR_dataset", split="train", trust_remote_code=True)
# # val_np = load_dataset("Naruto1/ASR_dataset", split="test", trust_remote_code=True)

In [7]:
# atc_dataset_train = atc_dataset_train.cast_column('audio', Audio(sampling_rate=16000))
# atc_dataset_valid = atc_dataset_valid.cast_column('audio', Audio(sampling_rate=16000))
data = load_dataset("amitpant7/nepali-speech-to-text")


README.md:   0%|          | 0.00/598 [00:00<?, ?B/s]

train.0-00000-of-00001.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train.1-00000-of-00001.parquet:   0%|          | 0.00/469M [00:00<?, ?B/s]

train.2-00000-of-00001.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

Generating train.0 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train.1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train.2 split:   0%|          | 0/650 [00:00<?, ? examples/s]

In [8]:
print("Available splits:", data.keys())

Available splits: dict_keys(['train.0', 'train.1', 'train.2'])


In [9]:
print("Available splits:", data.keys())

# Combine all splits into a single dataset
# combined_dataset = data['train.0']  # Start with the first split
total_splits =[]
for i in range(0, len(data)):
    split_name = f'train.{i}'
    if split_name in data:
        total_splits +=[data[split_name]]
    else:
        break  # Stop if we've reached the end of the splits


Available splits: dict_keys(['train.0', 'train.1', 'train.2'])


In [10]:
 combined_dataset = concatenate_datasets(total_splits)

In [11]:
len(combined_dataset)

2650

In [12]:
# Split the dataset into 90% train and 10% test
split_data = combined_dataset.train_test_split(test_size=0.1)

# Access the train and test sets
train_np = split_data['train']
val_np = split_data['test']

print(f"Train dataset size: {len(train_np)}")
print(f"Test dataset size: {len(train_np)}")

Train dataset size: 2385
Test dataset size: 2385


In [13]:
train_np[30]

{'audio': {'path': 'nep_2099_1647836016.wav',
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 48000},
 'transcription': 'पटायामा रहेको रमणीय स्थलहरूको दृश्य थाइल्यान्ड खाडीमा भएको ठुलो क्षेत्र हो'}

Resmapling at 16khz

In [14]:
train_np = train_np.cast_column('audio', Audio(sampling_rate=16000))
val_np = val_np.cast_column('audio', Audio(sampling_rate=16000))

In [15]:
def prepare_dataset(batch):
  audio = batch['audio']
  batch['input_features'] = feature_extractor(audio['array'], sampling_rate=audio['sampling_rate']).input_features[0]
  batch['labels'] =  tokenizer(batch['transcription']).input_ids
  return batch


train_np = train_np.map(
    prepare_dataset,
    num_proc=2
)

val_np = val_np.map(
    prepare_dataset,
    num_proc=2
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/2385 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/265 [00:00<?, ? examples/s]

In [16]:
train_np[0].keys(), val_np[0].keys()

(dict_keys(['audio', 'transcription', 'input_features', 'labels']),
 dict_keys(['audio', 'transcription', 'input_features', 'labels']))

In [17]:
input_str = train_np[0]["transcription"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 तपाईँ उत्तम भोजन चाहनुहुन्छ भने हिमालयन होटल सिल्‍वर ओक होटल जानुहोस्
Decoded w/ special:    <|startoftranscript|><|ne|><|transcribe|><|notimestamps|>तपाईँ उत्तम भोजन चाहनुहुन्छ भने हिमालयन होटल सिल्‍वर ओक होटल जानुहोस्<|endoftext|>
Decoded w/out special: तपाईँ उत्तम भोजन चाहनुहुन्छ भने हिमालयन होटल सिल्‍वर ओक होटल जानुहोस्
Are equal:             True


### Preparing the Model


In [18]:
model = WhisperForConditionalGeneration.from_pretrained(model_id)

model.generation_config.task = 'transcribe'
model.generation_config.language = 'nepali'
model.generation_config.forced_decoder_ids = None

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [19]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [20]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## Defining evaluation metrices

In [21]:
metric = evaluate.load('wer')

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-np",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    learning_rate=0.00001,
    warmup_steps=500,
    bf16=False,
    fp16=True,
    num_train_epochs=epochs,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    predict_with_generate=True,
    generation_max_length=225,
    report_to=['tensorboard'],
    load_best_model_at_end=True,
    metric_for_best_model='wer',
    greater_is_better=False,
    dataloader_num_workers=2,
    save_total_limit=2,
    lr_scheduler_type='constant',
    seed=42,
    data_seed=42
)



In [23]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_np,
    eval_dataset=val_np,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [24]:
trainer.train()

  self.pid = os.fork()
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Wer
1,0.3677,0.255623,44.312232
2,0.1508,0.227675,39.362208
3,0.0765,0.229518,35.030938
4,0.0427,0.249626,34.745359
5,0.0275,0.268735,35.173727
6,0.0206,0.287416,33.745835
7,0.0144,0.319763,35.316516
8,0.0134,0.324508,32.8891
9,0.0114,0.334728,33.269871
10,0.0102,0.355963,34.554974


  self.pid = os.fork()
You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os

TrainOutput(global_step=17910, training_loss=0.029174629873832732, metrics={'train_runtime': 25685.2076, 'train_samples_per_second': 2.786, 'train_steps_per_second': 0.697, 'total_flos': 2.0648285392896e+19, 'train_loss': 0.029174629873832732, 'epoch': 30.0})

### For inference

In [25]:
# from torch.utils.data import DataLoader
# dataloader = DataLoader(val_np, batch_size=4, collate_fn=data_collator)

In [26]:
# model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [27]:
# # Iterate through batches and get model predictions
# for batch in dataloader:
#     input_features = batch["input_features"]
#     labels = batch["labels"]

#     # Perform inference (using no_grad for evaluation)
#     with torch.no_grad():
#         generated_ids = model.generate(input_features, language='ne')

#     # Decode the predicted token IDs into text
#     predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
#     sents = processor.batch_decode(labels, skip_special_tokens=True)
#     # Print or store predictions
#     for pred, sen in zip(predictions, sents):
#         print(f'GT:{sen}.......... Pred: {pred}')
        
#     break