In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/audio-dataset-male-marathi-english/english/txt.done.data
/kaggle/input/audio-dataset-male-marathi-english/english/calculate_duration.pl
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02106.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02167.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_04383.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00165.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_01895.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02607.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02527.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_03849.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_01315.wav
/kaggle/input/audio-dataset-male-marathi-english/

In [2]:
import os
import librosa
from datasets import Dataset
import pandas as pd
import re

In [3]:
# Path to your dataset
audio_dir = "/kaggle/input/audio-dataset-male-marathi-english/english/wav"  # Folder containing WAV files
txt_file = "/kaggle/input/audio-dataset-male-marathi-english/english/txt.done.data"  # Your txt.done.data file


In [4]:
# Define a regular expression pattern to match the filename and transcription inside parentheses
pattern = r'\(\s*(\S+)\s+"(.+)"\s*\)'  # Matches: (filename "transcript")

# List to hold data
data = []

# Open the txt.done.data file and extract filenames and transcripts
with open(txt_file, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            match = re.match(pattern, line)
            if match:
                filename = match.group(1).strip("() ")  # Clean filename
                transcript = match.group(2)  # Get transcript
                
                # Construct full audio path using os.path.join
                full_audio_path = os.path.join(audio_dir, f"{filename}.wav")  # Append .wav
                
                # Add the file and transcript to the data list
                data.append({
                    "file": full_audio_path,
                    "text": transcript
                })
            else:
                print(f"Error parsing line: {line}")

# Verify the collected data
print("Collected Data:")
for entry in data:
    print(f"Audio file: {entry['file']}, Transcript: {entry['text']}")

# Create a Dataset from the collected data
dataset = Dataset.from_dict({ 
    "file": [entry["file"] for entry in data], 
    "text": [entry["text"] for entry in data] 
})

def load_audio(batch):
    # Check if the audio file exists
    if not os.path.exists(batch["file"]):
        raise FileNotFoundError(f"Audio file not found: {batch['file']}")

    # Load the audio file and return as array and sampling rate
    audio_array, sampling_rate = librosa.load(batch["file"], sr=16000)  # Whisper expects 16kHz audio
    batch["audio"] = audio_array
    batch["sampling_rate"] = sampling_rate
    return batch

# Apply audio loading function to the dataset
dataset = dataset.map(load_audio)

# Verify the dataset
print("Loaded Dataset Sample:")
# print(dataset[0])

Collected Data:
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00001.wav, Transcript:  Author of the danger trail, Philip Steels, etc. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00002.wav, Transcript:  Not at this particular case, Tom apologized, Whittemore. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00003.wav, Transcript:  For the twentieth time, that evening, the two men, shook hands. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00004.wav, Transcript:  Lord, but I'm glad to see you again, Phil. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00005.wav, Transcript:  Will we ever forget it. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00006.wav, Transcript:  God bless 'em, I hope I'll go on seeing them forever.

Map:   0%|          | 0/5578 [00:00<?, ? examples/s]

Loaded Dataset Sample:
{'file': '/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00001.wav', 'text': ' Author of the danger trail, Philip Steels, etc. ', 'audio': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [5]:
dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'file': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'audio': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'sampling_rate': Value(dtype='int64', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [6]:
from transformers import WhisperProcessor
import numpy as np

# Load the Whisper processor (tokenizer + feature extractor)
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

def preprocess(batch):
    try:
        # Ensure "audio" is in the expected format (list of floats or np.ndarray)
        if not isinstance(batch["audio"], (list, np.ndarray)):
            raise ValueError(f"Expected audio to be a list or ndarray, but got {type(batch['audio'])}")

        # Convert audio to a numpy array (in case it's a list)
        audio_array = np.array(batch["audio"], dtype=np.float32)

        # Check if sampling rate is provided and valid
        if "sampling_rate" not in batch or batch["sampling_rate"] is None:
            raise ValueError("Sampling rate is missing or invalid in the batch.")
        
        # Process the audio to get input features
        input_features = processor(
            audio_array, 
            sampling_rate=batch["sampling_rate"], 
            return_tensors="pt"
        ).input_features
        
        # Process the text to get labels
        if "text" not in batch or not isinstance(batch["text"], str):
            raise ValueError("Text is missing or not in valid format.")
        
        labels = processor.tokenizer(batch["text"], return_tensors="pt").input_ids

        # Add the processed data to the batch
        batch["input_features"] = input_features.squeeze(0)
        batch["labels"] = labels.squeeze(0)

    except Exception as e:
        print(f"Error processing batch: {batch}")
        print(f"Error message: {str(e)}")
    
    return batch


subset_dataset = dataset.map(preprocess, batched=False)

# Remove unnecessary columns after preprocessing
columns_to_remove = ["audio", "sampling_rate", "file", "text"]
subset_dataset = subset_dataset.remove_columns([col for col in columns_to_remove if col in subset_dataset.column_names])

# # Display a sample of the processed dataset
# print("Processed Dataset Sample:")
# print(subset_dataset[0])


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Map:   0%|          | 0/5578 [00:00<?, ? examples/s]

In [7]:
subset_dataset

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 5578
})

In [8]:
!pip install evaluate
!pip install jiwer
import evaluate

# Load the Word Error Rate (WER) metric using the evaluate library
wer_metric = evaluate.load("wer")

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.10.0


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [9]:
wer_metric

EvaluationModule(name: "wer", module_type: "metric", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Compute WER score of transcribed segments against references.

Args:
    references: List of references for each speech input.
    predictions: List of transcriptions to score.
    concatenate_texts (bool, default=False): Whether to concatenate all input texts or compute WER iteratively.

Returns:
    (float): the word error rate

Examples:

    >>> predictions = ["this is the prediction", "there is an other sample"]
    >>> references = ["this is the reference", "there is another one"]
    >>> wer = evaluate.load("wer")
    >>> wer_score = wer.compute(predictions=predictions, references=references)
    >>> print(wer_score)
    0.5
""", stored examples: 0)

In [10]:
import os

# Set WANDB API key
os.environ["WANDB_API_KEY"] = "a592e2058e20b7e0aff254b848a6cb6821ec37d2"

In [11]:
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from torch.nn.utils.rnn import pad_sequence


# Load the pre-trained Whisper model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")


def data_collator(batch):
    # Convert input features to tensors, and pad them to the same length
    input_features = [torch.tensor(item["input_features"]) if isinstance(item["input_features"], list) else item["input_features"] for item in batch]
    input_features = pad_sequence(input_features, batch_first=True)

    # Convert labels to tensors, and pad them to the same length
    labels = [torch.tensor(item["labels"]) if isinstance(item["labels"], list) else item["labels"] for item in batch]
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Padding labels with -100 to ignore during loss calculation

    return {"input_features": input_features, "labels": labels}

# Define metrics to evaluate the model (optional)
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Decode predictions and labels to text (if needed for metrics)
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Compute word error rate (WER)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-fine-tuned",
    per_device_train_batch_size=1,     # Adjust according to your hardware
    gradient_accumulation_steps=1,
    eval_strategy="epoch",             # Use eval_strategy instead of deprecated evaluation_strategy
    learning_rate=5e-5,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    num_train_epochs=2,               # Change this based on how long you want to train
    save_total_limit=2,
    fp16=True,                        # Mixed precision for faster training (optional, only with compatible hardware)
    predict_with_generate=True,
    remove_unused_columns=False        # Ensure that we do not remove columns automatically
)

# Ensure unnecessary columns are removed after preprocessing
subset_dataset = subset_dataset.remove_columns([col for col in ["audio", "sampling_rate", "file", "text"] if col in subset_dataset.column_names])
from sklearn.model_selection import train_test_split
import numpy as np

dataset_size = len(subset_dataset)

# Shuffle the indices for random split
indices = np.random.permutation(dataset_size)

# Define the split point (e.g., 80% train, 20% eval)
split_index = int(dataset_size * 0.8)

# Split the indices for training and evaluation sets
train_indices = indices[:split_index]
eval_indices = indices[split_index:]

# Convert the NumPy int64 indices to Python native integers
train_indices = [int(i) for i in train_indices]
eval_indices = [int(i) for i in eval_indices]

# Manually split the dataset into training and evaluation sets
train_dataset = subset_dataset.select(train_indices)
eval_dataset = subset_dataset.select(eval_indices)
# Set up the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,      # Use training set
    eval_dataset=eval_dataset,        # Use evaluation set
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,  # Set tokenizer to the processor for audio processing
    compute_metrics=compute_metrics   # Optional, remove if not using metrics
)

# Fine-tune the model
trainer.train()



config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myatharth-thakare211[0m ([33myatharth-thakare211-vishwakarma-institute-of-technology-pune[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113192488889784, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Wer
1,0.2527,0.240376,0.092014
2,0.0673,0.252977,0.112104


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=4462, training_loss=0.19177711333917535, metrics={'train_runtime': 6853.7614, 'train_samples_per_second': 1.302, 'train_steps_per_second': 0.651, 'total_flos': 2.57533611245568e+18, 'train_loss': 0.19177711333917535, 'epoch': 2.0})

In [20]:

# Save the fine-tuned model
trainer.save_model("./whisper-fine-tuned")

# Save the processor (tokenizer/feature extractor)
processor.save_pretrained("./whisper-fine-tuned")

print("Model fine-tuning and saving completed.")

Model fine-tuning and saving completed.


In [21]:
import shutil

In [22]:
# After fine-tuning, save the model and processor
output_dir = "./whisper-fine-tuned"  # Specify the output directory
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)


[]

In [24]:
# Move the model directory to the output folder for saving
shutil.copytree(output_dir, '/kaggle/working/fine_tuned')


'/kaggle/working/fine_tuned'

In [None]:
print(hello)