In [1]:
from datasets import Dataset, Audio
import pandas as pd
from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("dataset/metadata.csv")
df["file"] = df["file"].apply(lambda x: f"dataset/wavs_converted/{x}.wav")
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("file", Audio(sampling_rate=16000))

In [3]:
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="ta", task="transcribe")

def prepare(example):
    audio = example["file"]
    inputs = processor(audio["array"], sampling_rate=16000, return_tensors="pt")
    input_values = inputs.input_features.squeeze(0)
    labels = processor.tokenizer(example["text"], return_tensors="pt").input_ids.squeeze(0)
    return {"input_features": input_values, "labels": labels}

processed_dataset = dataset.map(prepare)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5395/5395 [06:07<00:00, 14.69 examples/s]


In [4]:
from dataclasses import dataclass
import torch

@dataclass
class CustomDataCollator:
    processor: any

    def __call__(self, features):
        # Extract input features and labels
        input_features = [{"input_features": f["input_features"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        # Pad input features
        batch = self.processor.feature_extractor.pad(
            input_features, return_tensors="pt"
        )

        # Pad labels directly using tokenizer
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            return_tensors="pt",
            padding=True
        )

        # Replace padding token ids with -100 for loss masking
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["input_ids"] == self.processor.tokenizer.pad_token_id, -100
        )

        batch["labels"] = labels
        return batch

In [5]:
data_collator = CustomDataCollator(processor=processor)

In [6]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

training_args = TrainingArguments(
    output_dir="./whisper-tamil-finetuned",
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    num_train_epochs=10,
    save_steps=500,
    fp16=True,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=processor.feature_extractor,
    data_collator = CustomDataCollator(processor=processor)
)

trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,2.3711
20,1.1379
30,0.8973
40,0.767
50,0.7633
60,0.6688
70,0.6173
80,0.617
90,0.5745
100,0.5348




TrainOutput(global_step=6750, training_loss=0.094480023947027, metrics={'train_runtime': 2816.2182, 'train_samples_per_second': 19.157, 'train_steps_per_second': 2.397, 'total_flos': 1.328188852224e+18, 'train_loss': 0.094480023947027, 'epoch': 10.0})