<a href="https://colab.research.google.com/github/beinghorizontal/wav2vec2/blob/main/finetune_crossdelenna_medium_cross_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install required packages


In [None]:
import os
os.system('pip install datasets transformers jiwer evaluate huggingface_hub tokenizers')


In [None]:
import torch
import evaluate
import numpy as np
import random
import librosa
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import (
    WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor,
    WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
)
from google.colab import drive, output


# Enable custom widget manager


In [None]:
output.enable_custom_widget_manager()


# Check GPU availability


In [None]:
gpu_info = os.popen('nvidia-smi').read()
if 'failed' in gpu_info:
    print('Not connected to a GPU')
else:
    print(gpu_info)


# Load dataset

In [30]:
from datasets import load_dataset, DatasetDict
timit = load_dataset("crossdelenna/whisper_data_merge3")


README.md:   0%|          | 0.00/436 [00:00<?, ?B/s]

(…)-00000-of-00003-d01f13de4b26c049.parquet:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

(…)-00001-of-00003-20e13de4f66220d1.parquet:   0%|          | 0.00/76.7M [00:00<?, ?B/s]

(…)-00002-of-00003-3ead1efcc61a9bce.parquet:   0%|          | 0.00/114M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1325 [00:00<?, ? examples/s]

# Split dataset


In [31]:
# num_rows = len(timit['train'])
# num_test_rows = num_rows // 7
# num_train_rows = num_rows - num_test_rows
# timit_train = timit["train"].select(range(num_train_rows))
# timit_test = timit["train"].select(range(num_test_rows))


num_rows = int(len(timit['train']))
num_test_rows = int(len(timit['train'])/7)
num_train_rows = num_rows - num_test_rows
timit_train = timit["train"].select(range(num_train_rows))
timit_test = timit["train"].shuffle(seed=42).select(range(num_test_rows))
print(f"Train dataset size: {len(timit_train)}")
print(f"Test dataset size: {len(timit_test)}")

Train dataset size: 1136
Test dataset size: 189


# Load Whisper components from Hugging Face Hub


In [32]:
# feature_extractor = WhisperFeatureExtractor.from_pretrained("crossdelenna/whisper_med_alex.en")
# tokenizer = WhisperTokenizer.from_pretrained("crossdelenna/whisper_med_alex.en", language="English", task="transcribe")
# processor = WhisperProcessor.from_pretrained("crossdelenna/whisper_med_alex.en", language="English", task="transcribe")

feature_extractor = WhisperFeatureExtractor.from_pretrained("crossdelenna/whisper_med_alex.en")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium.en", language="English", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-medium.en", language="English", task="transcribe")

model = WhisperForConditionalGeneration.from_pretrained("crossdelenna/whisper_med_alex.en")



In [9]:
!apt-get install git

# Clone your Hugging Face repository
!git clone https://huggingface.co/crossdelenna/whisper_med_alex.en

# Verify the cloned repository
import os
print("Files in the cloned repository:", os.listdir("./whisper_med_alex.en"))


Filtering content: 100% (23/23), 6.33 GiB | 13.15 MiB/sFiltering content: 100% (23/23), 6.33 GiB | 35.60 MiB/s, done.
Files in the cloned repository: ['preprocessor_config.json', 'config.json', 'last-checkpoint', 'vocab.json', 'normalizer.json', 'model.safetensors', 'tokenizer_config.json', 'README.md', 'generation_config.json', 'added_tokens.json', 'merges.txt', 'special_tokens_map.json', 'runs', '.gitattributes', 'training_args.bin', 'tokenizer.json', '.git']


# Data collator


In [10]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


# Evaluation metric


In [11]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

# Freeze layers


In [33]:
def freeze_whisper_layers(model):
    for param in model.parameters():
        param.requires_grad = False

    try:
        encoder_layers = model.model.encoder.layers
        for layer in encoder_layers[-2:]:
            for param in layer.parameters():
                param.requires_grad = True
    except AttributeError:
        print("Could not access encoder layers")

    try:
        decoder_layers = model.model.decoder.layers
        for layer in decoder_layers[-2:]:
            for param in layer.parameters():
                param.requires_grad = True
    except AttributeError:
        print("Could not access decoder layers")

    try:
        model.model.encoder.layer_norm.requires_grad = True
    except AttributeError:
        print("Could not access encoder layer norm")

    try:
        model.model.decoder.layer_norm.requires_grad = True
    except AttributeError:
        print("Could not access decoder layer norm")

    for name, module in model.named_children():
        if 'proj' in name or 'head' in name or 'classifier' in name:
            for param in module.parameters():
                param.requires_grad = True

    return model

model = freeze_whisper_layers(model)

# Verify trainable parameters


In [34]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Percentage of trainable parameters: {trainable_params/total_params*100:.2f}%")

Total parameters: 763856896
Trainable parameters: 111888384
Percentage of trainable parameters: 14.65%


# Training arguments


In [40]:
# Define the checkpoint path
checkpoint_path = "/content/whisper_med_alex.en/last-checkpoint"  # Replace with the actual checkpoint folder name
import json
# Read the trainer_state.json file
trainer_state_file = os.path.join(checkpoint_path, "trainer_state.json")
with open(trainer_state_file, "r") as f:
    trainer_state = json.load(f)

# Extract the global step
global_step = trainer_state["global_step"]
print(f"Global step at the last checkpoint: {global_step}")

# Define the additional steps
additional_steps = 800

# Calculate the new maximum training steps
new_max_steps = global_step + additional_steps
print(f"New maximum training steps: {new_max_steps}")


Global step at the last checkpoint: 8215
New maximum training steps: 8815


In [41]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium.en",
    per_device_train_batch_size=24,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=new_max_steps,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=300,
    eval_steps=300,
    logging_steps=300,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    hub_strategy="checkpoint",
    hub_model_id="crossdelenna/whisper_med_alex.en",
    hub_token='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',
    resume_from_checkpoint=True  # This will resume training from the last checkpoint
)

trainer = Seq2SeqTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=timit_train,
    eval_dataset=timit_test,
    tokenizer=processor.feature_extractor,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

  trainer = Seq2SeqTrainer(


# Custom Seq2SeqTrainer to use sampled validation subset. Default random sample size is 300 from test data for faster evaluation at each eval_steps.


In [37]:
# Function to sample a subset of the validation data
# import random
# random_seed = random.randint(20, 100)  # You can adjust the range as needed

# def sample_validation_data(dataset, sample_size=300, seed=random_seed):
#     return dataset.shuffle(seed=seed).select(range(sample_size))

# class CustomSeq2SeqTrainer(Seq2SeqTrainer):
#     def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
#         # Generate a new random seed for each evaluation step

#         # Sample a smaller validation subset if it's an evaluation step
#         if self.state.global_step % self.args.eval_steps == 0:
#             eval_dataset = sample_validation_data(self.eval_dataset, sample_size=300, seed=42)
#         elif eval_dataset is None:
#             eval_dataset = self.eval_dataset
#         else:
#             eval_dataset = self.eval_dataset
#         # Call the parent class's evaluate method with the modified eval_dataset
#         return super().evaluate(eval_dataset=eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)

# trainer = CustomSeq2SeqTrainer(
#     model=model,
#     data_collator=data_collator,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     train_dataset=timit_train,
#     eval_dataset=timit_test,
#     tokenizer=processor.feature_extractor,
# )


  trainer = CustomSeq2SeqTrainer(


# Save processor and tokenizer locally


In [42]:
processor.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)


('./whisper-medium.en/tokenizer_config.json',
 './whisper-medium.en/special_tokens_map.json',
 './whisper-medium.en/vocab.json',
 './whisper-medium.en/merges.txt',
 './whisper-medium.en/normalizer.json',
 './whisper-medium.en/added_tokens.json')

# Train model


In [None]:
checkpoint_path = "/content/whisper_med_alex.en/last-checkpoint"  # Specify the path to the checkpoint

trainer.train(resume_from_checkpoint=checkpoint_path)


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss


# Push to hub


In [None]:
trainer.push_to_hub()


# Save model, processor, and tokenizer locally


In [None]:
processor.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
feature_extractor.save_pretrained(training_args.output_dir)


# Push processor and tokenizer to the Hugging Face Hub


In [None]:
processor.push_to_hub("crossdelenna/whisper_med_alex.en", token="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", commit_message="Upload processor")
tokenizer.push_to_hub("crossdelenna/whisper_med_alex.en", token="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", commit_message="Upload tokenizer")
feature_extractor.push_to_hub("crossdelenna/medium_cross.en", token="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", commit_message="Upload feature extractor")