## Installing Libraries

In [2]:
!pip install datasets transformers librosa jiwer --quiet
!pip install evaluate --quiet
# dataset is used to handle and preprocess dataset
# transformers is needed to run whisper
# librosa for audio analysis
# jiwer and evaluate for cer

## Importing Libraries

In [10]:
# --- Imports ---
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import evaluate
import pandas as pd
import torch
from datasets import Dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.trainer_utils import get_last_checkpoint

## Defining Configurations

In [None]:
# --- Configuration ---
# Root directory holding the Kaggle input files
KAGGLE_INPUT_DIR = "./data"

# CSV files, all located directly under the input root
TRAIN_CSV_PATH, TEST_CSV_PATH, SAMPLE_SUBMISSION_PATH = (
    os.path.join(KAGGLE_INPUT_DIR, csv_name)
    for csv_name in ("train.csv", "test.csv", "sample.csv")
)

# Train and test audio paths are both resolved against the input root
TRAIN_AUDIO_DIR = TEST_AUDIO_DIR = KAGGLE_INPUT_DIR

# Pretrained checkpoint to start from
MODEL_CHECKPOINT = "openai/whisper-small"

# Where model checkpoints and logs are written
OUTPUT_DIR = "./whisper-uyghur-asr"

# Optimisation settings
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 1e-5
NUM_TRAIN_EPOCHS = 1
FP16 = torch.cuda.is_available()  # mixed precision only when a CUDA device is present

# Checkpointing / evaluation / logging cadence, in steps
SAVE_STEPS = EVAL_STEPS = 1000
LOGGING_STEPS = 500

# Data selection
# Max audio duration, intended for filtering out very long samples.
# NOTE(review): no cell visible in this notebook reads this value - confirm.
MAX_DURATION_IN_SECONDS = 30
TRAIN_DATA_SUBSET_RATIO = 0.5  # fraction of the training rows to keep

## Loading the Dataset

In [None]:
# --- 1. Data Loading and Preprocessing ---

def _as_audio_dataset(frame):
    """Wrap a dataframe in a Dataset whose 'filepath' column is decoded as 16 kHz audio."""
    return Dataset.from_pandas(frame).cast_column("filepath", Audio(sampling_rate=16000))


print("Loading dataframes...")
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)
sample_submission_df = pd.read_csv(SAMPLE_SUBMISSION_PATH)

# Prefix every relative audio path with its audio directory
train_df["filepath"] = train_df["filepath"].map(
    lambda rel_path: os.path.join(TRAIN_AUDIO_DIR, rel_path)
)
test_df["filepath"] = test_df["filepath"].map(
    lambda rel_path: os.path.join(TEST_AUDIO_DIR, rel_path)
)

# Optionally keep only a reproducible random fraction of the training rows
if TRAIN_DATA_SUBSET_RATIO < 1.0:
    print(f"Sampling {TRAIN_DATA_SUBSET_RATIO*100}% of the training data...")
    train_df = (
        train_df
        .sample(frac=TRAIN_DATA_SUBSET_RATIO, random_state=42)
        .reset_index(drop=True)
    )

print(f"Train samples (after sampling): {len(train_df)}")
print(f"Test samples: {len(test_df)}")

# Hugging Face Dataset objects with the audio column attached
train_dataset = _as_audio_dataset(train_df)
test_dataset = _as_audio_dataset(test_df)

## Loading Model

In [None]:
# --- 2. Initialize Processor and Model ---

print(f"Loading Whisper processor and model: {MODEL_CHECKPOINT}")

# The processor bundles the feature extractor and the tokenizer used below
processor = WhisperProcessor.from_pretrained(
    pretrained_model_name_or_path=MODEL_CHECKPOINT,
)

# Sequence-to-sequence Whisper model loaded from the same checkpoint
model = WhisperForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=MODEL_CHECKPOINT,
)

## Processing the Data

In [None]:
# --- 3. Prepare Data for Training ---

def prepare_dataset(batch):
    """
    Turn one dataset row into model inputs and target labels.

    Reads the audio entry stored under ``batch["filepath"]`` (its ``array``
    and ``sampling_rate``) and the text under ``batch["transcription"]``,
    then adds two entries to the row:

    - ``input_features``: first element of the feature extractor's
      ``input_features`` output for the audio array.
    - ``labels``: the tokenizer's ``input_ids`` for the transcription.

    The same row object is returned after being updated.
    """
    decoded_audio = batch["filepath"]

    # Feature extraction on the raw waveform at its stored sampling rate
    extracted = processor.feature_extractor(
        decoded_audio["array"],
        sampling_rate=decoded_audio["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # Target text -> token ids
    encoded_text = processor.tokenizer(batch["transcription"])
    batch["labels"] = encoded_text.input_ids

    return batch

def _extract_test_features(example):
    """Compute only the input features for one test row (no labels needed for inference)."""
    decoded_audio = example["filepath"]
    extracted = processor.feature_extractor(
        decoded_audio["array"],
        sampling_rate=decoded_audio["sampling_rate"],
    )
    return {"input_features": extracted.input_features[0]}


# Run the preprocessing over both datasets, single-process in each case
print("Preprocessing training dataset...")
train_dataset = train_dataset.map(
    function=prepare_dataset,
    # Drop the original columns so only 'input_features' and 'labels' remain
    remove_columns=train_dataset.column_names,
    num_proc=1,  # num_proc=1 disables multiprocessing
)

print("Preprocessing test dataset (only input features needed for inference)...")
test_dataset_processed = test_dataset.map(
    function=_extract_test_features,
    remove_columns=test_dataset.column_names,
    num_proc=1,  # num_proc=1 disables multiprocessing
)

## Defining Data Collator

In [None]:
# --- 4. Define Data Collator ---

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate a list of preprocessed examples into one padded batch.

    Each example must provide ``"input_features"`` and ``"labels"``. The
    audio features are padded by ``processor.feature_extractor`` and the
    label ids by ``processor.tokenizer``; both are requested as PyTorch
    tensors.
    """
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate the two modalities, since each has its own padding routine
        audio_entries = []
        label_entries = []
        for example in features:
            audio_entries.append({"input_features": example["input_features"]})
            label_entries.append({"input_ids": example["labels"]})

        batch = self.processor.feature_extractor.pad(audio_entries, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(label_entries, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding: set them to
        # -100 so the loss ignores them
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # When every sequence starts with the BOS token, drop that first
        # column as it's not needed for training
        bos_id = self.processor.tokenizer.bos_token_id
        every_row_starts_with_bos = (labels[:, 0] == bos_id).all().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance passed to the trainer; it pads batches using the shared `processor`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Defining Metrics

In [None]:
# --- 5. Define Metrics ---

print("Loading CER metric...")
metric = evaluate.load("cer")

def compute_metrics(pred):
    """
    Compute the Character Error Rate (CER) for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored
            positions).

    Returns:
        Dict with a single key ``"cer"`` holding the CER multiplied by 100.
    """
    pred_ids = pred.predictions

    # Work on a copy so the caller's label array is not overwritten
    label_ids = pred.label_ids.copy()

    # -100 cannot be decoded, so swap it for the pad token id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and labels, dropping special tokens
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute CER as a percentage
    cer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

## Training Arguments

In [None]:
# --- 6. Set up Training Arguments ---

print("Setting up training arguments...")

# Total number of training steps.
# NOTE(review): this is derived from the un-split `train_dataset` and
# BATCH_SIZE only; GRADIENT_ACCUMULATION_STEPS is not part of the formula,
# so NUM_TRAIN_EPOCHS may not correspond to literal passes over the
# training split - confirm this is intended.
max_train_steps = NUM_TRAIN_EPOCHS * (len(train_dataset) // BATCH_SIZE)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # Batching
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Schedule (max_steps is used in place of num_train_epochs)
    learning_rate=LEARNING_RATE,
    warmup_steps=500,  # steps of linear warmup
    max_steps=max_train_steps,
    fp16=FP16,  # mixed precision if a GPU is available
    # Evaluation
    do_eval=True,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    predict_with_generate=True,  # generate predictions during evaluation
    # Checkpointing and best-model selection (lower CER is better)
    save_steps=SAVE_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    # Logging
    logging_steps=LOGGING_STEPS,
    report_to=["tensorboard"],
    push_to_hub=False,  # do not push to the Hugging Face Hub
)

# Hold out 10% of the training data for validation; the fixed seed keeps the
# split reproducible
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset_split, eval_dataset_split = (
    train_test_split["train"],
    train_test_split["test"],
)

print(f"Training dataset size (after split): {len(train_dataset_split)}")
print(f"Validation dataset size: {len(eval_dataset_split)}")

## Training

In [5]:
# --- 7. Initialize Trainer and Train Model ---

print("Initializing Trainer...")
# Only documented Seq2SeqTrainer arguments are passed. The constructor has no
# `feature_extractor` parameter, so passing one raises a TypeError; batches
# are built by `data_collator`, which already holds the full processor.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_split,
    eval_dataset=eval_dataset_split,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)


# Commented the following code as kernel crashed midway. Restarted training after picking up from checkpoint.
# print("Starting training...")
# trainer.train()

# print("Training complete! Saving final model...")
# trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
# processor.save_pretrained(os.path.join(OUTPUT_DIR, "final_processor"))

Loading dataframes...
Sampling 50.0% of the training data...
Train samples (after sampling): 3787
Test samples: 1894
Loading Whisper processor and model: openai/whisper-small
Preprocessing training dataset...


Map:   0%|          | 0/3787 [00:00<?, ? examples/s]

Preprocessing test dataset (only input features needed for inference)...


Map:   0%|          | 0/1894 [00:00<?, ? examples/s]

Loading CER metric...
Setting up training arguments...
Training dataset size (after split): 3408
Validation dataset size: 379
Initializing Trainer...


  trainer = Seq2SeqTrainer(


## Resuming Training

In [7]:
# Find the most recent `checkpoint-<step>` directory under the configured
# OUTPUT_DIR instead of hard-coding an absolute, machine-specific path.
# `get_last_checkpoint` lists the folder, so only call it when the folder exists.
checkpoint_path = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None

# Resume only from a checkpoint directory that exists and is not empty
if checkpoint_path is not None and os.listdir(checkpoint_path):
    print(f"Resuming training from checkpoint in: {checkpoint_path}")
    trainer.train(resume_from_checkpoint=checkpoint_path)
else:
    print("No valid checkpoint found. Starting training from scratch.")
    trainer.train()

print("\nTraining complete!")

Resuming training from checkpoint in: /kaggle/working/whisper-uyghur-asr/checkpoint-1000


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].



Training complete!


## Making Inference

In [8]:
# --- 8. Inference on Test Data ---

print("Performing inference on test data...")

# Put the model in evaluation mode before generating: the model may still be
# in training mode after `trainer.train()`.
model.eval()

predictions = []
# Walk through the test dataset in slices of BATCH_SIZE rows
for i in range(0, len(test_dataset_processed), BATCH_SIZE):
    batch = test_dataset_processed[i : i + BATCH_SIZE]
    input_features = torch.tensor(batch["input_features"]).to(model.device)

    # Generate token ids without tracking gradients
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode token ids to text, dropping special tokens
    transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    predictions.extend(transcriptions)

print(f"Generated {len(predictions)} transcriptions for the test set.")

# Every test row needs exactly one transcription in the submission
assert len(predictions) == len(test_df), "prediction count does not match number of test rows"

# --- 9. Create Submission File ---

print("Creating submission file...")
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "transcription": predictions
})

submission_path = "submission.csv"
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved to {submission_path}")
print("Script finished successfully!")

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Performing inference on test data...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated 1894 transcriptions for the test set.
Creating submission file...
Submission file saved to submission.csv
Script finished successfully!
