In [1]:
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large",language='hindi',task='transcribe')
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large",language='hindi',task='transcribe')
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large").to('cuda')

In [2]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large",language='hindi',task='transcribe')

## Dataset processing(train+dev,test)

In [4]:
import os
import pandas as pd
import torchaudio
from datasets import Dataset, DatasetDict, Audio
from tqdm import tqdm

# Set the dataset path
DATASET_PATH = r"C:\Users\WORKSTATIONS\Desktop\BijoyashreeDas\COMMON_VOICE_HI\cv-corpus-21.0-2025-03-14\hi"
CLIPS_PATH = os.path.join(DATASET_PATH, "clips")

# Load train+dev as train
train_df = pd.read_csv(os.path.join(DATASET_PATH, "train.tsv"), sep="\t")
dev_df = pd.read_csv(os.path.join(DATASET_PATH, "dev.tsv"), sep="\t")
train_df = pd.concat([train_df, dev_df], ignore_index=True)

# Load test data
test_df = pd.read_csv(os.path.join(DATASET_PATH, "test.tsv"), sep="\t")

# Function to get full audio path
def get_audio_path(filename):
    return os.path.join(CLIPS_PATH, filename)

# Convert data to Hugging Face dataset format
def convert_to_hf_dataset(df):
    df = df[['path', 'sentence']].dropna()  # Keep only required columns
    df['audio'] = df['path'].apply(get_audio_path)  # Convert paths
    return Dataset.from_pandas(df[['audio', 'sentence']])  # Create HF dataset

# Convert train and test to Hugging Face format
commonvoice_train = convert_to_hf_dataset(train_df)
commonvoice_test = convert_to_hf_dataset(test_df)

# Define dataset dictionary
commonvoice_dataset = DatasetDict({
    "train": commonvoice_train,
    "test": commonvoice_test
})

# Cast the audio column to Hugging Face Audio format
commonvoice_dataset = commonvoice_dataset.cast_column("audio", Audio())

# Print dataset structure
print(commonvoice_dataset)


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 7563
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 3337
    })
})


In [5]:
# Get the first sample from the train set
first_sample = commonvoice_dataset["train"][0]

# Print the audio filename and transcription
print("Audio File:", first_sample["audio"]["path"])
print("Transcription:", first_sample["sentence"])


Audio File: C:\Users\WORKSTATIONS\Desktop\BijoyashreeDas\COMMON_VOICE_HI\cv-corpus-21.0-2025-03-14\hi\clips\common_voice_hi_26008353.mp3
Transcription: हमने उसका जन्मदिन मनाया।


## Total hours in train and test

In [7]:
import torchaudio
from tqdm import tqdm

# Function to calculate total duration of audio files
def get_total_duration(dataset):
    total_duration = 0.0  # In seconds
    for sample in tqdm(dataset, desc="Calculating duration"):
        audio_path = sample["audio"]["path"]
        waveform, sample_rate = torchaudio.load(audio_path)  # Load audio
        total_duration += waveform.shape[1] / sample_rate  # Compute duration (seconds)
    
    return total_duration / 3600  # Convert seconds to hours

# Compute total duration for train and test sets
train_hours = get_total_duration(commonvoice_dataset["train"])
test_hours = get_total_duration(commonvoice_dataset["test"])

# Print results
print(f"Total duration of Train set: {train_hours:.2f} hours")
print(f"Total duration of Test set: {test_hours:.2f} hours")


Calculating duration: 100%|████████████████████████████████████████████████████████| 7563/7563 [01:33<00:00, 80.48it/s]
Calculating duration: 100%|████████████████████████████████████████████████████████| 3337/3337 [00:46<00:00, 71.36it/s]

Total duration of Train set: 9.38 hours
Total duration of Test set: 4.73 hours





In [9]:
import pandas as pd
import torchaudio
import os
from tqdm import tqdm

# Set dataset path
DATASET_PATH = "C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/COMMON_VOICE_HI/cv-corpus-21.0-2025-03-14/hi"

# Load validated.tsv
validated_df = pd.read_csv(os.path.join(DATASET_PATH, "validated.tsv"), sep="\t")

# Get full path of each audio file
validated_df["audio_path"] = validated_df["path"].apply(lambda x: os.path.join(DATASET_PATH, "clips", x))

# Function to calculate total duration
def get_total_duration(file_paths):
    total_duration = 0.0  # In seconds
    for audio_path in tqdm(file_paths, desc="Calculating duration"):
        if os.path.exists(audio_path):
            waveform, sample_rate = torchaudio.load(audio_path)  # Load audio
            total_duration += waveform.shape[1] / sample_rate  # Compute duration (seconds)
    
    return total_duration / 3600  # Convert seconds to hours

# Compute total duration
validated_hours = get_total_duration(validated_df["audio_path"])

# Print results
print(f"Total duration of validated set: {validated_hours:.2f} hours")


Calculating duration: 100%|█████████████████████████████████████████████████████| 10979/10979 [00:30<00:00, 363.45it/s]

Total duration of validated set: 14.18 hours





## Resample audio files to 16kHz


In [6]:
import torchaudio
import torch
from datasets import Audio

# Get the first audio sample in the train set
sample = commonvoice_dataset["train"][0]["audio"]

# Print original sampling rate
print(f"Original Sampling Rate: {sample['sampling_rate']} Hz")



# Function to resample audio to 16kHz
def resample_audio(batch):
    waveform = batch["audio"]["array"]
    orig_sr = batch["audio"]["sampling_rate"]
    
    # Convert to PyTorch tensor
    waveform = torch.tensor(waveform, dtype=torch.float32)

    # Resample if needed
    if orig_sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_sr, 16000)
        waveform = resampler(waveform)

    return {"audio": {"array": waveform.numpy(), "sampling_rate": 16000}}  # Convert back to NumPy

# Apply the resampling function to train and test sets
commonvoice_dataset = commonvoice_dataset.map(resample_audio)

print("✅ Resampling complete. All audio is now at 16kHz.")



Original Sampling Rate: 32000 Hz


Map:   0%|          | 0/7563 [00:00<?, ? examples/s]

Map:   0%|          | 0/3337 [00:00<?, ? examples/s]

✅ Resampling complete. All audio is now at 16kHz.


In [7]:
import random

# Function to print sampling rate of N random samples
def print_random_sampling_rates(dataset, split, num_samples=3):
    print(f"\nSampling rates of {num_samples} random files from '{split}' set:")
    
    # Select random indices
    random_indices = random.sample(range(len(dataset[split])), num_samples)
    
    # Fetch and print sampling rates
    for idx in random_indices:
        sample = dataset[split][idx]["audio"]
        print(f"Sample {idx}: {sample['sampling_rate']} Hz")

# Print sampling rates for train and test sets
print_random_sampling_rates(commonvoice_dataset, "train")
print_random_sampling_rates(commonvoice_dataset, "test")



Sampling rates of 3 random files from 'train' set:
Sample 5003: 16000 Hz
Sample 2440: 16000 Hz
Sample 5144: 16000 Hz

Sampling rates of 3 random files from 'test' set:
Sample 1293: 16000 Hz
Sample 450: 16000 Hz
Sample 2008: 16000 Hz


## WER on train and test sets before fine tuning

In [28]:
import torch
import evaluate
from tqdm import tqdm

# Load WER metric
wer_metric = evaluate.load("wer")

# Function to transcribe audio
def transcribe_audio(batch):
    audio = batch["audio"]["array"]  # Get audio waveform
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")  # Process audio
    input_features = inputs.input_features.to("cuda")  # Move to GPU

    # Generate transcription
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode predictions
    transcription = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return {"transcription": transcription}

# Apply transcription function to train and test sets
commonvoice_dataset = commonvoice_dataset.map(transcribe_audio)

# Compute WER
def compute_wer(dataset):
    references = [x["sentence"] for x in dataset]  # Ground truth
    predictions = [x["transcription"] for x in dataset]  # Model output
    wer = wer_metric.compute(predictions=predictions, references=references)
    return wer

# Compute WER for train and test sets
train_wer = compute_wer(commonvoice_dataset["train"])
test_wer = compute_wer(commonvoice_dataset["test"])

print(f"✅ WER on Train Set: {train_wer:.2%}")
print(f"✅ WER on Test Set: {test_wer:.2%}")


Map:   0%|          | 0/7563 [00:00<?, ? examples/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Map:   0%|          | 0/3337 [00:00<?, ? examples/s]

✅ WER on Train Set: 68.92%
✅ WER on Test Set: 71.67%


## Actual vs Predicted transcription for 5 random audio samples from test

In [61]:
import torch

# Function to transcribe audio using Whisper model
def transcribe_audio(audio_array, sampling_rate):
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")  # Process audio
    input_features = inputs.input_features.to("cuda")  # Move to GPU

    # Generate transcription
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # Decode predictions
    transcription = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

# 🎧 Compare Actual vs. Predicted for First 5 Samples in Test Set
print("\n✅ Comparing Actual vs. Predicted Transcriptions (First 20 Samples)\n")

for idx in range(min(20, len(commonvoice_dataset["test"]))):  # Ensure we don't exceed dataset size
    sample = commonvoice_dataset["test"][idx]

    # Transcribe using Whisper
    predicted_transcription = transcribe_audio(sample["audio"]["array"], sample["audio"]["sampling_rate"])

    print(f"🎧 **Sample {idx+1}:**")
    print(f"📝 **Actual   :** {sample['sentence']}")
    print(f"🤖 **Predicted:** {predicted_transcription}")
    print("-" * 80)



✅ Comparing Actual vs. Predicted Transcriptions (First 20 Samples)

🎧 **Sample 1:**
📝 **Actual   :** अब रामपुर में अखिलेश बांटेंगे लैपटॉप का 'लॉलीपॉप'
🤖 **Predicted:**  अब राम्पुर में अकिलेश बातेंगे लैप्टप का लोलीपॉप
--------------------------------------------------------------------------------
🎧 **Sample 2:**
📝 **Actual   :** Flipkart: बंपर ऑफर्स के साथ बिक रहा है Lenovo का ये शानदार स्मार्टफोन
🤖 **Predicted:**  Flipkart, Bumper offer के साथ बिग रहा है, Lenovo का ये शानदार smartphone
--------------------------------------------------------------------------------
🎧 **Sample 3:**
📝 **Actual   :** मैं मुसीबत में पड़ गया।
🤖 **Predicted:**  میں مسیوت میں پڑ گیا
--------------------------------------------------------------------------------
🎧 **Sample 4:**
📝 **Actual   :** सुशील मोदी है 'अफवाह मियां', बिगड़ चुका है मानसिक संतुलन: तेजस्वी यादव
🤖 **Predicted:**  शुशिल मोडि है अख्वा मियां बीगर चुका है मनसी सनकुलं ते जस्पी आदा
---------------------------------------------------------------

## Extract log-mel features and tokenize the transcriptions(for both train and test)

In [12]:
from transformers import WhisperFeatureExtractor

# Load the Whisper feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large")

def extract_features_and_encode(batch):
    # Extract log-Mel spectrogram features
    batch["input_features"] = feature_extractor(batch["audio"]["array"], sampling_rate=16000).input_features[0]

    # Encode target transcriptions
    batch["labels"] = tokenizer(batch["sentence"]).input_ids

    return batch

# Apply the function to the dataset
commonvoice_dataset = commonvoice_dataset.map(extract_features_and_encode, num_proc=1)

print("✅ Feature extraction and tokenization complete!")


Map:   0%|          | 0/7563 [00:00<?, ? examples/s]

Map:   0%|          | 0/3337 [00:00<?, ? examples/s]

✅ Feature extraction and tokenization complete!


{
    'audio': {
        'array': numpy_array, 
        'sampling_rate': 16000
    },
    'sentence': 'This is the transcription of the audio.',
    'input_features': tensor_of_log_mel_spectrogram,
    'labels': [tokenized_ids]
}


Current dataset looks like above.

In [15]:
# Print dataset format after feature extraction
print(commonvoice_dataset)

# Print a sample entry from the dataset (first example from train set)
print(commonvoice_dataset["train"][0])


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



## Change it to
Dataset({
    features: ['input_features', 'labels'],
    num_rows: 6760
})

In [24]:
# Remove unnecessary columns
commonvoice_dataset = commonvoice_dataset.remove_columns(["audio", "sentence"])
#commonvoice_dataset = commonvoice_dataset.remove_columns(["transcription"])

# Print dataset structure
print(commonvoice_dataset)


DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 7563
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 3337
    })
})


### input_features → Tensor of shape (batch_size, 80, time_steps)

80 is the number of Mel frequency bins (Whisper feature size)

time_steps varies based on the longest audio in the batch (others are padded)

### labels → Tensor of shape (batch_size, label_length)

The tokenized transcription sequences

Padded to the longest sequence in the batch

Padding tokens replaced with -100 (ignored during loss computation)

In [27]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [29]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

### Evaluation metrics

In [32]:
import evaluate

metric = evaluate.load("wer")

## Post-processing on the model

To reduce our models memory footprint, we load the model in 8bit, this means we quantize the model to use 1/4th precision (when comapared to float32) with minimal loss to performance. Finally, we need to apply some post-processing steps on the 8-bit model to enable training. We do so by first freezing all the model layers, and then cast the layer-norm and the output layer in float32 for training and model stability. Since the Whisper model uses Convolutional layers in the Encoder, checkpointing disables grad computation to avoid this we specifically need to make the inputs trainable.

In [34]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large").to('cuda')


In [119]:
pip install peft


Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
Downloading accelerate-1.0.1-py3-none-any.whl (330 kB)
Installing collected packages: accelerate, peft
Successfully installed accelerate-1.0.1 peft-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [121]:
pip install bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-win_amd64.whl.metadata (5.1 kB)
Downloading bitsandbytes-0.45.4-py3-none-win_amd64.whl (75.4 MB)
   ---------------------------------------- 0.0/75.4 MB ? eta -:--:--
    --------------------------------------- 1.0/75.4 MB 6.3 MB/s eta 0:00:12
   - -------------------------------------- 3.4/75.4 MB 9.2 MB/s eta 0:00:08
   --- ------------------------------------ 6.0/75.4 MB 10.0 MB/s eta 0:00:07
   ---- ----------------------------------- 8.4/75.4 MB 10.4 MB/s eta 0:00:07
   ----- ---------------------------------- 10.5/75.4 MB 10.2 MB/s eta 0:00:07
   ------ --------------------------------- 11.8/75.4 MB 9.6 MB/s eta 0:00:07
   ------- -------------------------------- 13.4/75.4 MB 9.0 MB/s eta 0:00:07
   ------- -------------------------------- 14.7/75.4 MB 8.8 MB/s eta 0:00:07
   -------- ------------------------------- 16.3/75.4 MB 8.5 MB/s eta 0:00:07
   --------- ------------------------------ 18.1/75.4 MB 8.5 MB/s

In [125]:
pip install --upgrade peft


Note: you may need to restart the kernel to use updated packages.


In [133]:
pip install "accelerate>=0.26.0"


Note: you may need to restart the kernel to use updated packages.


In [139]:
import accelerate
print(accelerate.__version__)


1.0.1


In [60]:
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Instead of 8-bit
    bnb_4bit_compute_dtype="float16",  # Ensure compatibility
    bnb_4bit_use_double_quant=True  # Optional: Helps reduce memory usage
)

model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large",
    quantization_config=bnb_config,
    device_map="auto"
)

model = prepare_model_for_kbit_training(model)


In [62]:
def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)

model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

<torch.utils.hooks.RemovableHandle at 0x25519f7d520>

## Apply Low-rank adapters (LoRA) to the model

In [48]:
pip install --upgrade bitsandbytes transformers peft accelerate


Note: you may need to restart the kernel to use updated packages.


In [64]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 15,728,640 || all params: 1,559,033,600 || trainable%: 1.0089


We are ONLY using 1% of the total trainable parameters, thereby performing Parameter-Efficient Fine-Tuning

## Define the Training Configuration

In [70]:
!pip uninstall h5py -y
!pip install --no-cache-dir h5py


Found existing installation: h5py 3.11.0
Uninstalling h5py-3.11.0:
  Successfully uninstalled h5py-3.11.0
Collecting h5py
  Downloading h5py-3.11.0-cp38-cp38-win_amd64.whl.metadata (2.5 kB)
Downloading h5py-3.11.0-cp38-cp38-win_amd64.whl (3.0 MB)
   ---------------------------------------- 0.0/3.0 MB ? eta -:--:--
   ------------------------ --------------- 1.8/3.0 MB 11.2 MB/s eta 0:00:01
   ---------------------------------------- 3.0/3.0 MB 11.6 MB/s eta 0:00:00
Installing collected packages: h5py
Successfully installed h5py-3.11.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.16.1 requires flatbuffers>=23.5.26, but you have flatbuffers 1.12 which is incompatible.
tensorflow-intel 2.16.1 requires keras>=3.0.0, but you have keras 2.9.0 which is incompatible.
tensorflow-intel 2.16.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.
tensorflow-intel 2.16.1 requires tensorboard<2.17,>=2.16, but you have tensorboard 2.9.1 which is incompatible.


In [71]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=r"C:\Users\WORKSTATIONS\Desktop\BijoyashreeDas\WHISPER",  
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  
    learning_rate=1e-3,
    warmup_steps=50,
    num_train_epochs=1,
    evaluation_strategy="steps",
    fp16=True,
    per_device_eval_batch_size=8,
    generation_max_length=128,
    logging_steps=100,
    #max_steps=100,  # only for testing purposes, remove this in final run
    remove_unused_columns=False,  
    label_names=["labels"],  
)


## Train the model->save the adapter weights and trained model

In [77]:
import os
from transformers import Seq2SeqTrainer, TrainerCallback, TrainerState, TrainerControl, TrainingArguments
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

# Define save path
save_path = "C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/WHISPER"
adapter_path = os.path.join(save_path, "lora_adapter")

# Ensure save directories exist
os.makedirs(save_path, exist_ok=True)
os.makedirs(adapter_path, exist_ok=True)

# Callback to save only LoRA adapter weights
class SavePeftModelCallback(TrainerCallback):
    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        kwargs["model"].save_pretrained(peft_model_path)

        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)
        return control

# Trainer setup
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=commonvoice_dataset["train"],
    eval_dataset=commonvoice_dataset["test"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelCallback],
)

# Disable caching for training
model.config.use_cache = False




  trainer = Seq2SeqTrainer(
max_steps is given, it will override any value given in num_train_epochs


In [79]:
# Train the model
trainer.train()



  return fn(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
100,0.4601,0.316768


TrainOutput(global_step=100, training_loss=0.4601191711425781, metrics={'train_runtime': 910.2888, 'train_samples_per_second': 0.879, 'train_steps_per_second': 0.11, 'total_flos': 1.71665620992e+18, 'train_loss': 0.4601191711425781, 'epoch': 0.10570824524312897})

In [85]:
# Save the full fine-tuned model (Whisper + LoRA)
#model.save_pretrained(save_path)
#processor.save_pretrained(save_path)
from peft import PeftModel

# Save LoRA adapter separately
model.save_pretrained(adapter_path)


#print(f"✅ Model and weights saved at: {save_path}")
print(f"✅ LoRA adapter saved separately at: {adapter_path}")

✅ LoRA adapter saved separately at: C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/WHISPER\lora_adapter


## Print steps-train loss-test loss (Cross-Entropy Loss (CE Loss))

If you have 1,000 training samples and use batch_size=8, then you'll have:

1000/8=125 steps per epoch



In [88]:
print("Step\tTraining Loss\tValidation Loss")
for log in trainer.state.log_history:
    step = log.get("step", "N/A")
    train_loss = log.get("loss", None)  # Training loss
    val_loss = log.get("eval_loss", None)  # Validation loss

    if step != "N/A":  # Only print if it's a valid step
        train_loss_str = f"{train_loss:.6f}" if train_loss is not None else "N/A"
        val_loss_str = f"{val_loss:.6f}" if val_loss is not None else "N/A"
        print(f"{step}\t{train_loss_str}\t{val_loss_str}")


Step	Training Loss	Validation Loss
100	0.460100	N/A
100	N/A	0.316768
100	N/A	N/A


## Evaluation and Inference

 Loads the PEFT/LoRA configuration
✅ Loads the base Whisper model in 8-bit mode for efficiency
✅ Merges the fine-tuned LoRA weights with the base model
✅ Enables caching for faster inference

In [92]:
from peft import PeftModel
from transformers import WhisperForConditionalGeneration

# Define paths
base_model_path = "openai/whisper-large"  # Change if needed
adapter_path = "C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/WHISPER/lora_adapter"

# Load base Whisper model
base_model = WhisperForConditionalGeneration.from_pretrained(base_model_path).to("cuda")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, adapter_path)

# Enable cache for inference
model.config.use_cache = True

print("✅ Model and LoRA adapter loaded successfully!")


✅ Model and LoRA adapter loaded successfully!


## Save final model

In [95]:
save_path = "C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/WHISPER/final_model"

# Save the full model with LoRA adapter
model.save_pretrained(save_path)

print(f"Model saved successfully at {save_path}")


Model saved successfully at C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/WHISPER/final_model


## Compute WER on Train and Test Sets on saved model after finetuning 

In [103]:
from transformers import WhisperProcessor

# Reload the processor from the base model and save it
processor = WhisperProcessor.from_pretrained("openai/whisper-large")  # Change to your base model
processor.save_pretrained("C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/WHISPER/final_model")


[]

In [105]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from torch.utils.data import DataLoader
import torch
import gc
import numpy as np
from tqdm import tqdm
import evaluate  # ✅ Use `evaluate` instead of `datasets.load_metric`

# Define paths
model_path = "C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/WHISPER/final_model"

# Load the model
model = WhisperForConditionalGeneration.from_pretrained(model_path).to("cuda")
processor = WhisperProcessor.from_pretrained(model_path)

# Set to eval mode
model.eval()

# Load WER metric
metric = evaluate.load("wer")  # ✅ Correct way to load the WER metric


In [107]:
def compute_wer(dataset, batch_size=8, max_new_tokens=255):
    """Generates transcriptions and computes WER for the given dataset."""
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    predictions, references = [], []

    for batch in tqdm(dataloader, desc="Evaluating WER..."):
        with torch.no_grad():
            input_features = batch["input_features"].to("cuda")

            # Generate transcription
            generated_tokens = model.generate(input_features, max_new_tokens=max_new_tokens)

            # Decode predictions and references
            decoded_preds = processor.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = processor.batch_decode(batch["labels"], skip_special_tokens=True)

            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

        # Free memory
        del generated_tokens, batch
        gc.collect()

    # Compute WER
    wer = 100 * metric.compute(predictions=predictions, references=references)
    return wer


In [111]:
# Assuming 'common_voice' is your dataset
train_wer = compute_wer(commonvoice_dataset["train"])
test_wer = compute_wer(commonvoice_dataset["test"])

print(f"Train WER: {train_wer:.2f}%")
print(f"Test WER: {test_wer:.2f}%")


Evaluating WER...:   0%|                                                                       | 0/946 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Evaluating WER...: 100%|█████████████████████████████

Train WER: 43.76%
Test WER: 47.22%





If your dataset has 7,568 samples, then:

7568/8 = 946 batches

This means your dataset has 946 mini-batches, and each iteration processes one batch.