<a href="https://colab.research.google.com/github/beinghorizontal/wav2vec2/blob/main/finetune_whisper_medium_in.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# import datasets
#locale.getpreferredencoding = lambda: "UTF-8"
!pip install datasets
import datasets
#timit = datasets.load_dataset("crossdelenna/whisper_data_merge2", use_auth_token='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')
#!pip install datasets
#import datasets
timit = datasets.load_dataset("crossdelenna/whisper_data_merge2")

In [None]:
num_rows = int(len(timit['train']))
num_test_rows = int(len(timit['train'])/7)
num_train_rows = num_rows - num_test_rows
timit_train = timit["train"].select(range(num_train_rows))
timit_test = timit["train"].select(range(num_test_rows))


In [None]:
timit_train

In [None]:
timit_test

In [None]:
#import locale
#locale.getpreferredencoding = lambda: "UTF-8"

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# !unzip "/content/drive/MyDrive/mycheckpoint.zip" -d "/content/"


In [None]:
%%capture
#!pip install datasets==1.18.3
!pip install git+https://github.com/huggingface/transformers
!pip install jiwer
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install bitsandbytes
!pip install accelerate


from accelerate import PartialState


In [None]:
import librosa
import re
import json
import numpy as np
import random

In [None]:
# path to upload checkpoints to drive
#from shutil import copyfile
#dst = '/usr/local/lib/python3.9/dist-packages/transformers/trainer.py'
#src = '/content/drive/MyDrive/trainer_mod_large.py'
#copyfile(src, dst)

In [None]:
%%capture
!apt install git-lfs

In [None]:
import IPython
from google.colab import output

display(IPython.display.Javascript('''
 function ClickConnect(){
   btn = document.querySelector("colab-connect-button")
   if (btn != null){
     console.log("Click colab-connect-button");
     btn.click()
     }

   btn = document.getElementById('ok')
   if (btn != null){
     console.log("Click reconnect");
     btn.click()
     }
  }

setInterval(ClickConnect,60000)
'''))

print("Done.")

### Load WhisperFeatureExtractor


In [None]:
from transformers import WhisperFeatureExtractor

#feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium.en")
feature_extractor = WhisperFeatureExtractor.from_pretrained("crossdelenna/medium_cross.en")


### Load WhisperTokenizer

In [None]:
from transformers import WhisperTokenizer
#tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium.en", language="English", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("crossdelenna/medium_cross.en", language="English", task="transcribe")

### Combine To Create A WhisperProcessor

In [None]:
from transformers import WhisperProcessor
#processor = WhisperProcessor.from_pretrained("openai/whisper-medium.en", language="English", task="transcribe")
processor = WhisperProcessor.from_pretrained("crossdelenna/medium_cross.en", language="English", task="transcribe")

Let's print the first example of the Common Voice dataset to see
what form the data is in:

In [None]:
print(timit["train"][0])

**Note**: Currently `datasets` make use of [`torchaudio`](https://pytorch.org/audio/stable/index.html) and [`librosa`](https://librosa.org/doc/latest/index.html) for audio loading and resampling. If you wish to implement your own costumized data loading/sampling, feel free to just make use of the `"path"` column instead and disregard the `"audio"` column.

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

Next, the evaluation metric is defined. As mentioned earlier, the
predominant metric in ASR is the word error rate (WER), hence we will use it in this notebook as well.

In [None]:
import evaluate

metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
# import os
# #dics = os.listdir('/content/drive/MyDrive/wav2vec2-base-en-in/wav2vec2-base-en-in')
# dics = os.listdir('/content/wav2vec2-large-eng-ind')
# if dics[0] == '.ipynb_checkpoints':
#   dics=dics[1:]
# match = [t for t in dics if 'checkpoint' in t]
# model_url = '/content/wav2vec2-large-en-in/'+match[0]
# print(model_url)


In [None]:
from transformers import WhisperForConditionalGeneration
#model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium.en")
model = WhisperForConditionalGeneration.from_pretrained("crossdelenna/medium_cross.en")



# Freeze original layers


In [None]:
def freeze_whisper_layers(model):
    # Print out the model structure to understand its exact attributes
    print(model)

    # Freeze all parameters initially
    for param in model.parameters():
        param.requires_grad = False

    # Attempt to unfreeze specific components
    try:
        # Try to unfreeze the last few encoder layers
        encoder_layers = model.model.encoder.layers
        for layer in encoder_layers[-2:]:
            for param in layer.parameters():
                param.requires_grad = True
    except AttributeError:
        print("Could not access encoder layers")

    try:
        # Try to unfreeze the last few decoder layers
        decoder_layers = model.model.decoder.layers
        for layer in decoder_layers[-2:]:
            for param in layer.parameters():
                param.requires_grad = True
    except AttributeError:
        print("Could not access decoder layers")

    # Try to find and unfreeze final layer norms
    try:
        model.model.encoder.layer_norm.requires_grad = True
    except AttributeError:
        print("Could not access encoder layer norm")

    try:
        model.model.decoder.layer_norm.requires_grad = True
    except AttributeError:
        print("Could not access decoder layer norm")

    # Verify and unfreeze any classification or projection heads
    for name, module in model.named_children():
        if 'proj' in name or 'head' in name or 'classifier' in name:
            for param in module.parameters():
                param.requires_grad = True

    return model

# Apply freezing
model = freeze_whisper_layers(model)

# Verify trainable parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Percentage of trainable parameters: {trainable_params/total_params*100:.2f}%")

In [None]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [None]:
import os

# Get the list of files and directories in the "/content/whisper-small.en" folder.
#files_and_directories = os.listdir("/content/whisper-small.en")

# Filter the list to only include directories.
#directories = [f for f in files_and_directories if os.path.isdir(os.path.join("/content/whisper-small.en", f))]

# Find the directory that starts with "checkpoint-1200".
#checkpoint_directory = next((d for d in directories if d.startswith("checkpoint-1200")), None)

# Print the checkpoint directory name.
# if checkpoint_directory:
#   print(checkpoint_directory)
# else:
#   print("Checkpoint directory not found.")

In [None]:
# Add this before training
# total_params = sum(p.numel() for p in model.parameters())
# trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# print(f"Total parameters: {total_params}")
# print(f"Trainable parameters: {trainable_params}")
# print(f"Percentage of trainable parameters: {trainable_params/total_params*100:.2f}%")

In [None]:
checkpoint_path = "crossdelenna/medium_cross.en"


In [None]:
# prompt: huggingface transformers resume from checkpoint give specific path of checkpoint
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
#maxsteps = int(checkpoint_directory.split('-')[1])+1201
maxsteps = 1051
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium.en",  # change to a repo name of your choice
    per_device_train_batch_size=22,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=maxsteps,  # default is 4000 I changed to 2k so it stops training early and export model to HF repo
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,  #was working with 1 but very slow
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=350,
    eval_steps=350,
    logging_steps=350,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    hub_strategy="checkpoint",
    hub_model_id="crossdelenna/medium_cross.en",
    hub_token = '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',
    #optim="adamw_bnb_8bit",
)
trainer = Seq2SeqTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=timit_train,
    eval_dataset=timit_test,
    tokenizer=processor.feature_extractor,
)



Now, all instances can be passed to Trainer and we are ready to start training!

In [None]:
processor.save_pretrained(training_args.output_dir)

### Training

```javascript
function ConnectButton(){
    console.log("Connect pushed");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
setInterval(ConnectButton,60000);
```

In [None]:
# Resume training from a specific checkpoint
#resume_from_checkpoint = f"/content/whisper-small.en/checkpoint-{maxsteps-1}"
#checkpointpath = f'/content/whisper-small.en/{checkpoint_directory}'
#checkpointpath = "/content/content/whisper-small.en/checkpoint-800"
#resume_from_checkpoint = checkpointpath


In [25]:
# prompt: huggingface how to resume training from checkpoint with path

#trainer.train(resume_from_checkpoint=resume_from_checkpoint)
trainer.train()
#trainer.train(resume_from_checkpoint=checkpoint_path)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
350,0.664,0.399837,18.209408
700,0.4625,0.324448,16.063299


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Step,Training Loss,Validation Loss,Wer
350,0.664,0.399837,18.209408
700,0.4625,0.324448,16.063299
1050,0.3703,0.303376,15.138377


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=1051, training_loss=0.4987827196334454, metrics={'train_runtime': 8268.9241, 'train_samples_per_second': 2.796, 'train_steps_per_second': 0.127, 'total_flos': 2.358005578334208e+19, 'train_loss': 0.4987827196334454, 'epoch': 3.726950354609929})

In [None]:
#trainer.train(resume_from_checkpoint=True)
#trainer.train()


In [27]:
trainer.push_to_hub()

events.out.tfevents.1739207638.b24e50d8c658.405.0:   0%|          | 0.00/7.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/crossdelenna/medium_cross.en/commit/6c91f1bc8abcbb95c9020cc17ea9da639b47dcab', commit_message='End of training', commit_description='', oid='6c91f1bc8abcbb95c9020cc17ea9da639b47dcab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/crossdelenna/medium_cross.en', endpoint='https://huggingface.co', repo_type='model', repo_id='crossdelenna/medium_cross.en'), pr_revision=None, pr_num=None)

In [26]:
# Save all components
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)
feature_extractor.save_pretrained(training_args.output_dir)

['./whisper-medium.en/preprocessor_config.json']

In [34]:
from huggingface_hub import HfApi

api = HfApi()


In [36]:
import os

# Get the list of directories in your model output directory
output_dir = "/content/whisper-medium.en"
checkpoint_dirs = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d)) and d.startswith("checkpoint-")]

# Get the latest checkpoint directory by sorting and selecting the last one
latest_checkpoint_dir = sorted(checkpoint_dirs)[0]

# Extract the checkpoint number
checkpoint_number = latest_checkpoint_dir.split("-")[1]
print(checkpoint_number)


1050


In [37]:
api.upload_folder(
    folder_path=f"/content/whisper-medium.en/checkpoint-{checkpoint_number}",  # Replace XXX with actual checkpoint number
    repo_id="crossdelenna/medium_cross.en",
    repo_type="model",token="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"
)

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/694M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/crossdelenna/medium_cross.en/commit/f4ca35fcba58dd44c960387b1f732188f7380c8a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='f4ca35fcba58dd44c960387b1f732188f7380c8a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/crossdelenna/medium_cross.en', endpoint='https://huggingface.co', repo_type='model', repo_id='crossdelenna/medium_cross.en'), pr_revision=None, pr_num=None)

In [None]:
# checkpoint_path = "crossdelenna/medium_cross.en/model.safetensors"
# trainer.train(resume_from_checkpoint=checkpoint_path)

In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"
# !zip -r /content/whisper-medium.en/mycheckpoint.zip /content/whisper-medium.en/checkpoint-401/
# !mv /content/whisper-medium.en/mycheckpoint.zip /content/drive/MyDrive/
