In [1]:
!pip install peft
!pip install bitsandbytes
!pip install accelerate

^C
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl (296 kB)
     ------------------------------------ 296.4/296.4 kB 163.5 kB/s eta 0:00:00
Collecting huggingface-hub>=0.17.0
  Downloading huggingface_hub-0.24.5-py3-none-any.whl (417 kB)
     ------------------------------------- 417.5/417.5 kB 27.2 kB/s eta 0:00:00
Collecting accelerate>=0.21.0
  Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
     ------------------------------------- 315.1/315.1 kB 40.0 kB/s eta 0:00:00
Collecting fsspec>=2023.5.0
  Downloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
     ------------------------------------- 177.6/177.6 kB 24.7 kB/s eta 0:00:00
Installing collected packages: fsspec, huggingface-hub, accelerate, peft
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2022.7.1
    Uninstalling fsspec-2022.7.1:
      Successfully uninstalled fsspec-2022.7.1
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.16.4
    Unins



^C




In [64]:
import os
import gc
import torch
import peft
import torchaudio
import accelerate
import numpy as np
import pandas as pd
from typing import Any
import bitsandbytes as bnb
from dataclasses import dataclass
from torch.utils.data import Dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, WhisperTokenizer, WhisperProcessor, DataCollatorForSeq2Seq, BitsAndBytesConfig

# Whisper processor configured for English transcription.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-large-v3",
    language='en',
    task="transcribe",
)

# Stand-alone tokenizer with the same language/task settings; used to encode transcripts as labels.
whisper_tokenizer = WhisperTokenizer.from_pretrained(
    'openai/whisper-large-v3',
    language='en',
    task="transcribe",
)

# LibriSpeech "train-clean-360" split, read from the Kaggle input directory (no download attempted).
train_dataset = torchaudio.datasets.LIBRISPEECH(
    '/kaggle/input/librispeech-clean',
    url='train-clean-360',
    download=False,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [65]:
class training_dataset(Dataset) :
    """Map-style dataset that converts raw speech samples into Whisper training examples.

    Wraps an indexable dataset whose items are tuples laid out as
    ``(waveform, sample_rate, transcript, ...)`` (the layout produced by
    ``torchaudio.datasets.LIBRISPEECH``). Relies on the module-level ``processor``
    and ``whisper_tokenizer`` objects.

    Args:
        dataset: The underlying indexable dataset of speech samples.
    """

    def __init__(self, dataset) :
        super().__init__()
        self.data = dataset

    def __len__(self) :
        """Return the number of samples in the wrapped dataset."""
        return len(self.data)

    def __getitem__(self, idx) :
        """Build the model inputs and label ids for sample ``idx``.

        Returns:
            The processor output (input features plus attention mask) with an
            extra ``'labels'`` entry holding the tokenized transcript.
        """
        # Index the wrapped dataset once; the original indexed it twice, and for
        # torchaudio's LIBRISPEECH every access loads the audio file again.
        sample = self.data[idx]
        waveform, sample_rate, transcript = sample[0], sample[1], sample[2]

        # Pass the sample's own rate so the feature extractor can reject audio that
        # is not at the rate it expects, instead of assuming 16 kHz.
        # The former `padding_size=3000` keyword was dropped: it is not a
        # WhisperFeatureExtractor argument and was silently ignored.
        data = processor(
            waveform.numpy(),
            sampling_rate=sample_rate,
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True,
        )
        data['labels'] = whisper_tokenizer(
            transcript,
            padding='longest',
            truncation=True,
            max_length=100,
            return_tensors='pt',
        ).input_ids
        return data

dataset = training_dataset(train_dataset)

In [53]:
# Load Whisper large-v3 with 8-bit quantized weights, letting the library place it on devices.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
custom_model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v3",
    quantization_config=bnb_config,
    device_map='auto',
)

# PEFT helper that readies a quantized model for training.
q_model = prepare_model_for_kbit_training(custom_model)

# LoRA adapters on the attention query/value projections.
peft_config = LoraConfig(
    inference_mode=False,
    target_modules=["q_proj", "v_proj"],
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
)
final_model = get_peft_model(q_model, peft_config)

In [29]:
def apply_masking(text, mask_rate=0.07):
    """Randomly zero out elements of a tensor.

    Each element is independently kept when a uniform sample in [0, 1) is
    greater than ``mask_rate`` and set to zero otherwise.

    Args:
        text: Tensor to mask (in this notebook: the input features of one example).
        mask_rate: Threshold for the uniform samples; roughly the fraction of
            elements that get zeroed.

    Returns:
        A tensor with the same shape and dtype as ``text`` with the masked
        elements set to zero.
    """
    # Draw the random samples on the input's device: a CPU-only mask cannot be
    # multiplied with a tensor that lives on another device.
    keep = torch.rand(text.shape, device=text.device) > mask_rate
    return text * keep

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate per-example Whisper features and labels into a padded training batch.

    Attributes are documented inline. Calling the collator with a list of
    examples (each holding ``"input_features"`` and ``"labels"`` tensors with a
    leading batch dimension of 1) returns the padded batch with a ``"labels"``
    entry in which padding positions are replaced by -100.
    """
    # Whisper processor providing `.feature_extractor` and `.tokenizer`.
    processor: Any
    # Id of the decoder start token to strip from the labels. When left as None it
    # is looked up as the tokenizer's "<|startoftranscript|>" token.
    decoder_start_token_id: Any = None

    def __call__(self, features):
        # Audio side: drop the per-example batch dim, apply random masking, then pad/stack.
        input_features = [
            {"input_features": apply_masking(feature["input_features"].squeeze(0))}
            for feature in features
        ]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Text side: pad the label sequences to a common length.
        label_features = [{"input_ids": feature["labels"].squeeze(0)} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # The model prepends the decoder start token itself when it shifts the labels
        # into decoder inputs, so a leading start token must be cut here. The previous
        # check compared against `bos_token_id`, which for Whisper is "<|endoftext|>"
        # and never matches the "<|startoftranscript|>" token the labels begin with.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
        if (labels[:, 0] == start_token_id).all().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [30]:
def total_params(model):
    """Return the total number of elements across all of ``model``'s parameters."""
    count = 0
    for param in model.parameters():
        count += param.numel()
    return count

# Report the model's memory footprint. The value is divided by 1024 three times and
# labelled GB (presumably get_memory_footprint() returns bytes — confirm).
print(f'Memory used by model: {round(final_model.get_memory_footprint()/1024/1024/1024, 2)} GB')
# Total parameter count as computed by total_params() above.
print(f'total number of parameters is {total_params(final_model)}')
# PEFT's own summary of trainable vs. all parameters.
final_model.print_trainable_parameters()

Memory used by model: 1.71 GB
total number of parameters is 1559219200
trainable params: 15,728,640 || all params: 1,559,219,200 || trainable%: 1.0088


In [51]:
from transformers import TrainerState, TrainerCallback, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

class SavePeftModelCallback(TrainerCallback):
    """Trainer callback that stores the adapter inside every checkpoint folder.

    On each save event the model is written via ``save_pretrained`` to an
    ``adapter_model`` sub-directory of the current checkpoint folder, and a
    ``pytorch_model.bin`` file in that folder is deleted if one exists.
    """

    def on_save(
        self,
        args: Seq2SeqTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # Folder of the checkpoint for the current global step, e.g. "<output_dir>/<prefix>-<step>".
        step_dir_name = f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
        checkpoint_folder = os.path.join(args.output_dir, step_dir_name)

        # Write the model through its own save_pretrained into the checkpoint folder.
        kwargs["model"].save_pretrained(os.path.join(checkpoint_folder, "adapter_model"))

        # Remove the full-weights file from the checkpoint when it is present.
        full_weights_file = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(full_weights_file):
            os.remove(full_weights_file)

        return control

In [None]:
import warnings
# Silence library warnings so the training log stays readable.
warnings.filterwarnings("ignore")

training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/",
    report_to="none",
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=35e-6,
    warmup_steps=50,
    # Training length is set by max_steps alone. The former `num_train_epochs=1`
    # was removed because the Trainer ignores it whenever max_steps is given.
    max_steps=3000,
    logging_steps=10,
    fp16=True,
    # Keep every key produced by the dataset/collator; nothing is dropped
    # based on the model's forward signature.
    remove_unused_columns=False,
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=final_model,
    train_dataset=dataset,
    data_collator=collator,
    tokenizer=processor.feature_extractor,
    callbacks=[SavePeftModelCallback],
)
trainer.train()

# Persist the model's state dict after training.
state_dict = final_model.state_dict()
torch.save(state_dict, '/kaggle/working/model_weights.pth')

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
