## Necessary Libraries


In [1]:
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio --quiet
!pip install jinja2 --quiet

In [2]:
from datasets import load_dataset, DatasetDict
from transformers import (
    WhisperTokenizer,
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

In [3]:
from datasets import Audio
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import torch
import evaluate


## Defining Parameters

In [4]:
model_id = 'openai/whisper-small'
out_dir = 'whisper_tiny_np'
epochs = 5
batch_size = 4

## Preprocessing the Dataset

In [5]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
tokenizer = WhisperTokenizer.from_pretrained(model_id, language='Nepali', task='transcribe')
processor = WhisperProcessor.from_pretrained(model_id, language='Nepali', task='transcribe')


In [6]:
train_np = load_dataset("Naruto1/ASR_dataset", split="train", trust_remote_code=True)
# val_np = load_dataset("Naruto1/ASR_dataset", split="test", trust_remote_code=True)

Resolving data files:   0%|          | 0/200 [00:00<?, ?it/s]

In [7]:
train_np[0]

{'text': 'बलिउडका विभिन्न नायिकासँग अफेयर विवाहसूत्रमा नबाँधिएको प्रकरण बलिष्ठ शरीर बाहुबलि हिरो हरिण मारेको ओरोपितजस्ता विविध कारणले प्रायः चर्चा विवादमा आइरहने कलाकार सलमान खान उहाँको दानवीर दयालु स्वभावका विषयमा कमै चर्चा'}

In [8]:
train_np

Dataset({
    features: ['text'],
    num_rows: 100
})

In [9]:
# atc_dataset_train = atc_dataset_train.cast_column('audio', Audio(sampling_rate=16000))
# atc_dataset_valid = atc_dataset_valid.cast_column('audio', Audio(sampling_rate=16000))
train_np = load_dataset("fsicoli/common_voice_19_0", "ne-NP", split="train", trust_remote_code=True)
val_np = load_dataset("fsicoli/common_voice_19_0", "ne-NP", split="test", trust_remote_code=True)

In [10]:
train_np[0]

{'client_id': '9f8a47cee5574b287a8f93f5498d81115cf1dfbd718ead4f2265e4400f7de0f017a58a2c8c1245e0d3ceeccffa5b110322c4f784aa8a9785e3219557cb44395e',
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/2078d4f647abb87146c4e6361776aff17e038b4472a795fd02ab22d7c2574c59/ne-NP_train_0/common_voice_ne-NP_35314089.mp3',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/2078d4f647abb87146c4e6361776aff17e038b4472a795fd02ab22d7c2574c59/ne-NP_train_0/common_voice_ne-NP_35314089.mp3',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          7.00167766e-06, -4.02070254e-05, -3.65305859e-05]),
  'sampling_rate': 48000},
 'sentence': 'म पनि जान्छु है त अहिले लाई ।',
 'up_votes': 4,
 'down_votes': 0,
 'age': 'thirties',
 'gender': 'male_masculine',
 'accent': 'nepali',
 'locale': 'ne-NP',
 'segment': '',
 'variant': ''}

Resmapling at 16khz

In [11]:
train_np = train_np.cast_column('audio', Audio(sampling_rate=16000))
val_np = val_np.cast_column('audio', Audio(sampling_rate=16000))

In [12]:
train_np

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 381
})

In [13]:
train_np = train_np.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
val_np = val_np.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

In [14]:
for i, data in enumerate(train_np):
  if not 'sentence' in data.keys() or not 'audio' in data.keys():
    print(i, 'not found')

In [15]:
def prepare_dataset(batch):
  audio = batch['audio']
  batch['input_features'] = feature_extractor(audio['array'], sampling_rate=audio['sampling_rate']).input_features[0]
  batch['labels'] =  tokenizer(batch['sentence']).input_ids
  return batch


train_np = train_np.map(
    prepare_dataset,
    num_proc=1
)

val_np = val_np.map(
    prepare_dataset,
    num_proc=1
)

In [16]:
train_np[0].keys(), val_np[0].keys()

(dict_keys(['audio', 'sentence', 'variant', 'input_features', 'labels']),
 dict_keys(['audio', 'sentence', 'variant', 'input_features', 'labels']))

In [17]:
input_str = train_np[0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Input:                 म पनि जान्छु है त अहिले लाई ।
Decoded w/ special:    <|startoftranscript|><|ne|><|transcribe|><|notimestamps|>म पनि जान्छु है त अहिले लाई ।<|endoftext|>
Decoded w/out special: म पनि जान्छु है त अहिले लाई ।
Are equal:             True


### Preparing the Model


In [19]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("/kaggle/input/whisper_nepali/transformers/default/1/my_model_directory")
model = AutoModelForSpeechSeq2Seq.from_pretrained("/kaggle/input/whisper_nepali/transformers/default/1/my_model_directory")

model.generation_config.task = 'transcribe'
model.generation_config.language = 'nepali'
model.generation_config.forced_decoder_ids = None

In [20]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [21]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## Defining evaluation metrices

In [22]:
metric = evaluate.load('wer')

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer}

In [23]:
training_args = Seq2SeqTrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    learning_rate=0.00001,
    warmup_steps=500,
    bf16=False,
    fp16=True,
    num_train_epochs=epochs,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    predict_with_generate=True,
    generation_max_length=225,
    report_to=['tensorboard'],
    load_best_model_at_end=True,
    metric_for_best_model='wer',
    greater_is_better=False,
    dataloader_num_workers=2,
    save_total_limit=2,
    lr_scheduler_type='constant',
    seed=42,
    data_seed=42
)



In [24]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_np,
    eval_dataset=val_np,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [27]:
dataset = load_dataset("spktsagar/openslr-nepali-asr-cleaned", name="cleaned", split='train')

openslr-nepali-asr-cleaned.py:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

The repository for spktsagar/openslr-nepali-asr-cleaned contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/spktsagar/openslr-nepali-asr-cleaned.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


utt_spk_text_clean.tsv:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

asr_nepali_0.zip:   0%|          | 0.00/379M [00:00<?, ?B/s]

asr_nepali_1.zip:   0%|          | 0.00/372M [00:00<?, ?B/s]

asr_nepali_2.zip:   0%|          | 0.00/376M [00:00<?, ?B/s]

asr_nepali_3.zip:   0%|          | 0.00/367M [00:00<?, ?B/s]

asr_nepali_4.zip:   0%|          | 0.00/372M [00:00<?, ?B/s]

asr_nepali_5.zip:   0%|          | 0.00/366M [00:00<?, ?B/s]

asr_nepali_6.zip:   0%|          | 0.00/376M [00:00<?, ?B/s]

asr_nepali_7.zip:   0%|          | 0.00/377M [00:00<?, ?B/s]

asr_nepali_8.zip:   0%|          | 0.00/375M [00:00<?, ?B/s]

asr_nepali_9.zip:   0%|          | 0.00/371M [00:00<?, ?B/s]

asr_nepali_a.zip:   0%|          | 0.00/376M [00:00<?, ?B/s]

asr_nepali_b.zip:   0%|          | 0.00/373M [00:00<?, ?B/s]

asr_nepali_c.zip:   0%|          | 0.00/370M [00:00<?, ?B/s]

asr_nepali_d.zip:   0%|          | 0.00/377M [00:00<?, ?B/s]

asr_nepali_e.zip:   0%|          | 0.00/371M [00:00<?, ?B/s]

asr_nepali_f.zip:   0%|          | 0.00/368M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/157905 [00:00<?, ? examples/s]

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss,Wer
1,0.0547,0.515619,56.312165
2,0.0307,0.515328,54.781943
3,0.0242,0.546078,54.705432
4,0.0194,0.553506,55.011477
5,0.0193,0.563238,54.781943


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fo

TrainOutput(global_step=240, training_loss=0.029658036927382152, metrics={'train_runtime': 1118.5552, 'train_samples_per_second': 1.703, 'train_steps_per_second': 0.215, 'total_flos': 5.612986036224e+17, 'train_loss': 0.029658036927382152, 'epoch': 5.0})

In [28]:
dataset[0]

{'utterance_id': '4aa1fdca33',
 'speaker_id': '6a6d1',
 'utterance': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/b3f557561e70f0ebaa552943ee134754d6a731f729ff40b6e18072b1707b15c7/cleaned/asr_nepali/data/4a/4aa1fdca33.flac',
  'array': array([-3.23486328e-03, -2.38037109e-03,  6.43920898e-03, ...,
          9.15527344e-05,  9.15527344e-05, -5.49316406e-04]),
  'sampling_rate': 16000},
 'transcription': '००७ मिलको दूरीमा',
 'num_frames': 43200}

### For inference

In [29]:
from torch.utils.data import DataLoader
dataloader = DataLoader(val_np, batch_size=4, collate_fn=data_collator)

In [36]:
batch["input_features"].shape

torch.Size([4, 80, 3000])

In [31]:
# Iterate through batches and get model predictions
for batch in dataloader:
    input_features = batch["input_features"].to('cuda')
    labels = batch["labels"]

    # Perform inference (using no_grad for evaluation)
    with torch.no_grad():
        generated_ids = model.generate(input_features, language='ne')

    # Decode the predicted token IDs into text
    predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    sents = processor.batch_decode(labels, skip_special_tokens=True)
    # Print or store predictions
    for pred, sen in zip(predictions, sents):
        print(f'GT:{sen}.......... Pred: {pred}')
        
    break

GT:पानी तिर्खा लाग्यो ।.......... Pred: पानी तिर्का लाग्यो ।
GT:माग र महत्त्व डिजेलकै बढ्यो ।.......... Pred: माग्र महत्व डिजरकै पढ्या।
GT:काम र पढाइले गर्दा नै हो धेरै त ।.......... Pred: काम्र पढाइले कर्दा नै हुथिरी त ।
GT:हिजो गरेको आमालाई फोन ।.......... Pred: एज गरेको हामालाई फुन ।


In [37]:
# Define the directory where you want to save the model
save_directory = "./my_model_directory"

# Save the model
model.save_pretrained(save_directory)

# Save the processor/tokenizer (if applicable)
processor.save_pretrained(save_directory)  # or tokenizer.save_pretrained(save_directory)


[]

In [38]:
!zip -r /kaggle/working/my_model_directory.zip /kaggle/working/my_model_directory

  pid, fd = os.forkpty()


  adding: kaggle/working/my_model_directory/ (stored 0%)
  adding: kaggle/working/my_model_directory/preprocessor_config.json (deflated 42%)
  adding: kaggle/working/my_model_directory/vocab.json (deflated 69%)
  adding: kaggle/working/my_model_directory/generation_config.json (deflated 72%)
  adding: kaggle/working/my_model_directory/normalizer.json (deflated 81%)
  adding: kaggle/working/my_model_directory/model.safetensors (deflated 8%)
  adding: kaggle/working/my_model_directory/merges.txt (deflated 54%)
  adding: kaggle/working/my_model_directory/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/my_model_directory/tokenizer_config.json (deflated 96%)
  adding: kaggle/working/my_model_directory/config.json (deflated 59%)
  adding: kaggle/working/my_model_directory/added_tokens.json (deflated 80%)
