### **Prepare Environment**

In [None]:
%%capture

!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets transformers accelerate evaluate jiwer tensorboard gradio
!pip install --upgrade --quiet optuna
# !pip install --upgrade --quiet huggingface_hub

In [None]:
%%capture

# Read the Hugging Face token from Colab's secret store and keep it in a variable.
# Never paste the raw token into the notebook: a token that has been saved in plain text
# must be treated as compromised and revoked at https://huggingface.co/settings/tokens.
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
# Authenticate this session with the Hugging Face Hub (renders an interactive login widget).

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **Load, Split and Resample Audio Dataset**

In [None]:
from datasets import Dataset, Audio, DatasetDict, load_dataset
import pandas as pd

In [None]:
# Load the preprocessed audio transcriptions (columns: file_name, transcription) from Google Drive.

transcriptions_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/whispher-ak-gh-new-era/dataset/random_500/metadata-500.csv")

In [None]:
transcriptions_df.head(5)

Unnamed: 0,file_name,transcription
0,_image_0021_u192_1_1679697868381_03106.mp3,Ɔbea bi gyina kitchen a ɔkura fon wɔ ne nsam a...
1,_image_0027_u190_1_1679725603456_09436.mp3,Ɔbaa bi hyɛ ataade tuntum. Ɔtena akonnwa so. L...
2,_image_0052_u192_1_1679698531375_15774.mp3,Mmayewa soso nneɛma a wɔarape wɔ ade mu ɛde re...
3,_image_0067_u26_1_1679482591551_06277.mp3,"Iguam a adzetɔnfo wɔtsena hɔn nneɛma ekyir, na..."
4,_image_0076_u26_1_1679488708020_06278.mp3,Aberanteɛ bi a wahyehyɛ ne ho kamakama. N'afad...


In [None]:
transcriptions_df.shape

(500, 2)

In [None]:
# Load the custom audio files, split them into train/test sets and resample to 16 kHz.

# A fixed seed keeps the train/test partition identical across runs; without it every
# kernel restart evaluates on a different test set and WER values are not comparable.
SPLIT_SEED = 42

dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/Colab Notebooks/whispher-ak-gh-new-era/dataset/random_500/random_audio_500")
dataset = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

Resolving data files:   0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# view training dataset

print("train_dataset.shape: ", dataset["train"], '\n')
print("test_dataset.shape: ", dataset["test"])

train_dataset.shape:  Dataset({
    features: ['audio'],
    num_rows: 400
}) 

test_dataset.shape:  Dataset({
    features: ['audio'],
    num_rows: 100
})


In [None]:
# view sample transformed audio dataset information

dataset["train"][0]

{'audio': {'path': '/content/drive/MyDrive/Colab Notebooks/whispher-ak-gh-new-era/dataset/random_500/random_audio_500/ak_gh_image_0432_u122_1_1688827477537_00693.mp3',
  'array': array([-0.00021616, -0.00030124, -0.00034111, ..., -0.00013849,
         -0.00011364, -0.00014972]),
  'sampling_rate': 16000}}

In [None]:
# save split dataset for future use

# dataset.save_to_disk("/content/drive/MyDrive/Colab Notebooks/whispher-ak-gh-new-era/dataset/random_500/")

### **Load Whisper Feature Extractor**

In [None]:
# set up the feature extractor
from transformers import WhisperFeatureExtractor

# extracts log-mel spectrogram features from audio samples
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

### **Load Whisper Tokenizer**

In [None]:
from transformers import WhisperTokenizer

# maps a sequence of predicted text tokens to the actual text string
# NOTE(review): the model card written later in this notebook declares the language as
# "ak" (Akan), while the tokenizer prefix here is configured for Yoruba - presumably a
# stand-in because no Akan language option is available; confirm this is intentional.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Yoruba", task="transcribe")

### **Wrap Feature Extractor and Tokenizer in Whisper Processor**

In [None]:
from transformers import WhisperProcessor

# bundle the feature extractor and the tokenizer into a single processor object;
# it is loaded with the same language/task settings as the standalone tokenizer
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Yoruba", task="transcribe")

### **Prepare Audio and Transcription Dataset for Whisper Model**



In [None]:
# Build a {file_name: transcription} lookup table from the metadata dataframe so each
# audio file can later be paired with its text.

transcription_dict = dict(
    zip(transcriptions_df["file_name"], transcriptions_df["transcription"])
)

In [None]:
def prepare_data(dataset):
  """Turn one raw audio example into Whisper training inputs.

  This function is passed to ``dataset.map`` without batching, so despite its name the
  ``dataset`` argument is a single example rather than the whole dataset.

  Args:
    dataset: Mapping with an ``audio`` entry that exposes ``path``, ``array`` and
      ``sampling_rate``. It is updated in place.

  Returns:
    The same mapping with three entries added: ``sentence`` (the reference
    transcription), ``input_features`` (first item returned by ``feature_extractor``)
    and ``labels`` (token ids returned by ``tokenizer``).

  Raises:
    KeyError: If the audio file has no entry in ``transcription_dict``. Falling back to
      a placeholder sentence would silently train and evaluate the model on wrong text.
  """
  import os

  # load audio
  audio = dataset['audio']

  # transcriptions are keyed by bare file name, so drop the directory part of the path
  audio_file_name = os.path.basename(audio["path"])
  if audio_file_name not in transcription_dict:
    raise KeyError(f"No transcription found for audio file {audio_file_name!r}")
  transcription = transcription_dict[audio_file_name]

  # attach transcriptions to audio
  dataset['sentence'] = transcription

  # extract log-mel spectrogram features from audio array
  dataset["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

  # encode target text into label ids
  dataset['labels'] = tokenizer(transcription).input_ids
  return dataset


In [None]:
# run prepare_data over every example of both splits; the original columns of the train
# split are removed from the result
dataset = dataset.map(prepare_data, remove_columns=dataset.column_names["train"])

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Sanity check: decoding the first training example's labels and re-encoding them must
# give back the same text once the special tokens are stripped.
input_str = tokenizer.decode(dataset["train"][0]["labels"], skip_special_tokens=True)

# Re-encode the plain text, then decode it both with and without special tokens
labels = tokenizer(input_str).input_ids
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=strip_special)
    for strip_special in (False, True)
)

print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)

Input:                 Mmarima a wɔbɔ twene. Nkwadaa wom, mmranteɛ wom ɛna panin baako nso wom. Wɔafura ntoma wɔ wɔn ase. Na wɔde wɔn nsa regoro twene no mu. Wɔn nyinaa y Abibifoɔ. 
Decoded w/ special:    <|startoftranscript|><|yo|><|transcribe|><|notimestamps|>Mmarima a wɔbɔ twene. Nkwadaa wom, mmranteɛ wom ɛna panin baako nso wom. Wɔafura ntoma wɔ wɔn ase. Na wɔde wɔn nsa regoro twene no mu. Wɔn nyinaa y Abibifoɔ. <|endoftext|>
Decoded w/out special: Mmarima a wɔbɔ twene. Nkwadaa wom, mmranteɛ wom ɛna panin baako nso wom. Wɔafura ntoma wɔ wɔn ase. Na wɔde wɔn nsa regoro twene no mu. Wɔn nyinaa y Abibifoɔ. 
Are equal:             True


### **Model Evaluation Setup for Whisper**

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and ``label_ids``
            (reference token ids in which ignored positions are marked with -100).

    Returns:
        A dict ``{"wer": value}`` where ``value`` is 100 times the score returned by
        the module-level ``metric``.
    """
    import numpy as np

    pad_id = tokenizer.pad_token_id

    # -100 is not a real token id, so swap it for the pad token before decoding.
    # np.where builds new arrays, which leaves the arrays owned by the caller untouched.
    # The predictions get the same treatment in case they were padded with -100 too.
    label_ids = np.where(np.asarray(pred.label_ids) != -100, pred.label_ids, pad_id)
    pred_ids = np.where(np.asarray(pred.predictions) != -100, pred.predictions, pad_id)

    # decode to plain text; special tokens (including the padding) are dropped
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [None]:
import evaluate

# instantiate the evaluation metrics: word error rate and character error rate
# NOTE(review): `cer_metric` is loaded here, but `compute_metrics` only reads `metric`.
metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

## **Training Whisper with Hyperparameter Tuning**

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

In [None]:
from transformers import WhisperForConditionalGeneration,GenerationConfig

# load the Whisper model from the same pretrained checkpoint as the feature extractor
# and tokenizer ("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [None]:
# model generation configuration settings

# Language and task used to build the decoder prompt during generation.
model.generation_config.language = "yoruba"
model.generation_config.task = "transcribe"
# Clear the forced decoder ids so that the language/task settings above are the ones
# that take effect.
model.generation_config.forced_decoder_ids = None
# NOTE: `generation_config.lang_to_id` is intentionally left untouched. It is the
# checkpoint's mapping from language tokens to token ids, which generation uses to
# resolve `language`; overwriting it with a plain string such as "yo" breaks that lookup.

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator that will dynamically pad the inputs received.

    The docstring has to be the first statement of the class body; placed after the
    field declarations it is only a stray string expression and never becomes
    ``__doc__``.

    Args:
        processor ([`WhisperProcessor`])
            The processor used for processing the data. Its ``feature_extractor`` pads
            the audio features and its ``tokenizer`` pads the label sequences.
        decoder_start_token_id (`int`)
            The begin-of-sentence of the decoder.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate a list of examples into one padded batch.

        Args:
            features: Examples that each provide ``input_features`` and ``labels``.

        Returns:
            The batch produced by the feature extractor's ``pad`` with an extra
            ``labels`` entry in which padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's appended later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [None]:
# initialise the data collator; the decoder start token id is read from the model config

data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

# data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor, decoder_start_token_id=tokenizer.lang_code_to_id["yoruba"])

### **Hyperparameter Tuning**

In [None]:
import optuna
import numpy as np
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback, EarlyStoppingCallback

In [None]:
# finding optimal hyperparameters with Optuna

def objective(trial):
  """Optuna objective: fine-tune Whisper with sampled hyperparameters and return the WER.

  Args:
    trial: Optuna trial object used to sample the hyperparameters.

  Returns:
    The ``eval_wer`` value reported by ``trainer.evaluate`` on ``dataset["test"]``.
    The study minimises this value.
  """
  import copy
  import gc

  # Define hyperparameters to tune
  # (suggest_float(..., log=True) replaces the deprecated suggest_loguniform)
  learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True)
  per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16])
  warmup_steps = trial.suggest_int('warmup_steps', 0, 50)
  max_steps = trial.suggest_int('max_steps', 100, 500)
  generation_max_length = trial.suggest_int('generation_max_length', 50, 300)

  # Every trial must start from the same pretrained weights. Training the shared global
  # `model` would let each trial continue from where the previous one stopped, so the
  # scores would no longer reflect the sampled hyperparameters alone.
  trial_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
  # reuse the generation settings configured on the notebook-level model
  trial_model.generation_config = copy.deepcopy(model.generation_config)

  # Initialize EarlyStoppingCallback
  early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

  # Define training arguments
  # NOTE(review): recent transformers releases name `evaluation_strategy` as
  # `eval_strategy`; switch if the installed version rejects it - confirm.
  training_args = Seq2SeqTrainingArguments(
      output_dir="nyarkssss/whisper-experiment",
      per_device_train_batch_size=per_device_train_batch_size,
      gradient_accumulation_steps=1,
      learning_rate=learning_rate,
      warmup_steps=warmup_steps,
      max_steps=max_steps,
      gradient_checkpointing=True,
      fp16=True,
      evaluation_strategy="steps",
      per_device_eval_batch_size=8,
      predict_with_generate=True,
      generation_max_length=generation_max_length,
      save_steps=100,
      eval_steps=100,
      logging_steps=25,
      report_to=["tensorboard"],
      load_best_model_at_end=True,
      metric_for_best_model="wer",
      greater_is_better=False,
      # search trials are throw-away runs; only the final training run is published
      push_to_hub=False,
  )

  # Initialize trainer
  trainer = Seq2SeqTrainer(
      args=training_args,
      model=trial_model,
      train_dataset=dataset["train"],
      eval_dataset=dataset["test"],
      data_collator=data_collator,
      compute_metrics=compute_metrics,
      tokenizer=processor.feature_extractor,
      callbacks=[early_stopping],  # Add early stopping callback
  )

  try:
    # Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate(eval_dataset=dataset["test"])

    # Return the evaluation metric that Optuna should optimize
    return eval_results['eval_wer']
  finally:
    # release this trial's model before the next trial loads a new one
    del trainer, trial_model
    gc.collect()
    if torch.cuda.is_available():
      torch.cuda.empty_cache()


In [None]:
# Create an Optuna study that minimises the value returned by `objective` (the eval WER)
# and run 10 trials. The study is created in memory, so its results are lost if the
# kernel stops before the search finishes.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)

[I 2024-09-07 20:20:47,188] A new study created in memory with name: no-name-4c3f46a8-4d02-4d75-8294-4a9ded232afa
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.2386,0.855601,53.188363
200,0.048,0.933566,51.454599
300,0.013,0.95944,48.898031


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604

[I 2024-09-07 20:41:01,500] Trial 0 finished with value: 48.898031148986185 and parameters: {'learning_rate': 6.014805754497262e-05, 'per_device_train_batch_size': 8, 'warmup_steps': 0, 'max_steps': 387, 'generation_max_length': 233}. Best is trial 0 with value: 48.898031148986185.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.0018,1.019644,47.693212
200,0.0007,1.052685,47.134881
300,0.0004,1.06536,47.751984


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

[I 2024-09-07 21:12:12,352] Trial 1 finished with value: 47.13488098736409 and parameters: {'learning_rate': 1.0245182582987703e-05, 'per_device_train_batch_size': 16, 'warmup_steps': 49, 'max_steps': 361, 'generation_max_length': 288}. Best is trial 1 with value: 47.13488098736409.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.0014,1.111006,47.28181
200,0.0004,1.130701,47.399354
300,0.0002,1.144356,47.428739


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

[I 2024-09-07 21:42:31,023] Trial 2 finished with value: 47.281810167499266 and parameters: {'learning_rate': 1.322407207132022e-05, 'per_device_train_batch_size': 16, 'warmup_steps': 19, 'max_steps': 349, 'generation_max_length': 266}. Best is trial 1 with value: 47.13488098736409.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.001,1.141813,47.369968
200,0.0002,1.153747,48.016456
300,0.0001,1.16856,47.98707
400,0.0001,1.17763,48.016456


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

[I 2024-09-07 22:03:59,795] Trial 3 finished with value: 47.36996767558037 and parameters: {'learning_rate': 7.481554854487547e-06, 'per_device_train_batch_size': 8, 'warmup_steps': 17, 'max_steps': 500, 'generation_max_length': 135}. Best is trial 1 with value: 47.13488098736409.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.0147,1.14335,50.10285
200,0.0028,1.149871,49.368205


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

Step,Training Loss,Validation Loss,Wer
100,0.0147,1.14335,50.10285
200,0.0028,1.149871,49.368205
300,0.0009,1.179439,48.486629
400,0.0002,1.193579,48.516015


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

[I 2024-09-07 22:46:06,540] Trial 4 finished with value: 48.4866294446077 and parameters: {'learning_rate': 2.737394247750295e-05, 'per_device_train_batch_size': 16, 'warmup_steps': 36, 'max_steps': 498, 'generation_max_length': 237}. Best is trial 1 with value: 47.13488098736409.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.0001,1.225933,47.98707


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

[I 2024-09-07 22:56:18,908] Trial 5 finished with value: 47.987070232148106 and parameters: {'learning_rate': 3.2592476669850777e-06, 'per_device_train_batch_size': 8, 'warmup_steps': 33, 'max_steps': 186, 'generation_max_length': 144}. Best is trial 1 with value: 47.13488098736409.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.0019,1.284917,49.19189
200,0.0001,1.265263,48.222157


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

[I 2024-09-07 23:16:42,828] Trial 6 finished with value: 48.222156920364384 and parameters: {'learning_rate': 1.8493993629390376e-05, 'per_device_train_batch_size': 16, 'warmup_steps': 7, 'max_steps': 225, 'generation_max_length': 131}. Best is trial 1 with value: 47.13488098736409.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.0001,1.288162,60.828681
200,0.0,1.300277,60.62298


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

[I 2024-09-07 23:41:21,346] Trial 7 finished with value: 60.62297972377314 and parameters: {'learning_rate': 1.2751503615250094e-06, 'per_device_train_batch_size': 16, 'warmup_steps': 45, 'max_steps': 294, 'generation_max_length': 75}. Best is trial 1 with value: 47.13488098736409.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
100,0.002,1.287788,48.839259
200,0.0002,1.292332,47.781369


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 

### **Training Model**

In [None]:
# training configuration for the final run (values informed by the hyperparameter search)

training_args = Seq2SeqTrainingArguments(
    # output location, publishing and experiment tracking
    output_dir="nyarkssss/whisper-experiment",
    push_to_hub=True,
    report_to=["tensorboard"],
    # optimisation schedule
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=500,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    # memory / speed trade-offs
    gradient_checkpointing=True,
    fp16=True,
    # periodic evaluation using generated transcriptions
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # logging and checkpointing cadence
    logging_steps=25,
    save_steps=100,
    # keep the checkpoint with the lowest word error rate
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)


In [None]:
from transformers import Seq2SeqTrainer

# Build the trainer for the final run from the training arguments, datasets, collator
# and metric function defined earlier in the notebook.
# NOTE(review): a feature extractor is passed through the `tokenizer` argument -
# presumably so that it is saved alongside the checkpoints; confirm.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


In [None]:
# train the model

trainer.train()

In [None]:
# Publish the fine-tuned model to the Hugging Face Hub together with its model-card metadata

kwargs = dict(
    dataset_tags="speech_data_ghana_ug",
    dataset="Speech Data Ghana UG - Ghanaian Multilingual Sample Data",
    language="ak",
    model_name="Whisper Small Akan",
    finetuned_from="openai/whisper-small",
    tasks="automatic-speech-recognition",
    tags="akan-whisper-small",
)

trainer.push_to_hub(**kwargs)

In [None]:
# Push the tokenizer to the same Hugging Face Hub repository used as the training output_dir
tokenizer.push_to_hub("nyarkssss/whisper-experiment")

### **Plotting Training Information**

In [None]:
# import matplotlib.pyplot as plt

# # Data (Trial 1 Only Yoruba data)
# steps = [250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
# training_loss = [0.7803, 0.5693, 0.504, 0.463, 0.4036, 0.3496, 0.3346, 0.34, 0.3153, 0.3061, 0.2877, 0.2892]
# validation_loss = [0.9028, 0.7365, 0.6444, 0.5931, 0.5471, 0.5171, 0.4908, 0.4612, 0.438, 0.4228, 0.4164, 0.4112]
# wer_ortho = [78.2451, 85.1996, 73.1001, 78.2923, 68.4638, 73.5459, 67.8109, 70.3336, 66.8799, 67.9499, 67.9735, 68.9201]
# wer = [72.5162, 82.5191, 69.8969, 71.393, 62.0921, 71.9933, 65.7669, 65.5394, 60.1118, 60.2982, 59.7051, 61.942]

# # # Data (Trial 2 English data Included)
# # steps = [250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000, 3250]
# # training_loss = [0.7951, 0.605, 0.5273, 0.4483, 0.4158, 0.4032, 0.3261, 0.3606, 0.3008, 0.2938, 0.2933, 0.277, 0.2849]
# # validation_loss = [0.800042, 0.640793, 0.562115, 0.51375, 0.475197, 0.443391, 0.415314, 0.391015, 0.376906, 0.360765, 0.349412, 0.340355, 0.334607]
# # wer_ortho = [60.465069, 50.317554, 45.865714, 43.434856, 41.913033, 41.186616, 40.218737, 38.065864, 36.708398, 36.298521, 35.608628, 34.54741, 34.506828]
# # wer = [45.959711, 39.312049, 35.752566, 33.890156, 32.529456, 31.624857, 30.31927, 29.104903, 27.540859, 27.09236, 27.04485, 25.568225, 25.676549]

# # Plot Step vs. Loss
# plt.figure(figsize=(12, 6))
# plt.plot(steps, training_loss, label='Training Loss', marker='o')
# plt.plot(steps, validation_loss, label='Validation Loss', marker='o')
# plt.xlabel('Step')
# plt.ylabel('Loss')
# plt.title('Step vs. Loss')
# plt.legend()
# plt.grid(True)
# plt.show()

# # Plot Step vs. WER
# plt.figure(figsize=(12, 6))
# plt.plot(steps, wer_ortho, label='WER Orthographic', marker='o')
# plt.plot(steps, wer, label='WER Normalized', marker='o')
# plt.xlabel('Step')
# plt.ylabel('WER')
# plt.title('Step vs. WER')
# plt.legend()
# plt.grid(True)
# plt.show()

# # Plot all in one graph for better comparison
# fig, ax1 = plt.subplots(figsize=(12, 6))

# color = 'tab:red'
# ax1.set_xlabel('Step')
# ax1.set_ylabel('Loss', color=color)
# ax1.plot(steps, training_loss, label='Training Loss', color='tab:blue', marker='o')
# ax1.plot(steps, validation_loss, label='Validation Loss', color='tab:orange', marker='o')
# ax1.tick_params(axis='y', labelcolor=color)
# ax1.legend(loc='upper left')
# ax1.grid(True)

# ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
# color = 'tab:green'
# ax2.set_ylabel('WER', color=color)  # we already handled the x-label with ax1
# ax2.plot(steps, wer_ortho, label='WER Orthographic', color='tab:green', marker='o')
# ax2.plot(steps, wer, label='WER Normalized', color='tab:purple', marker='o')
# ax2.tick_params(axis='y', labelcolor=color)
# ax2.legend(loc='upper right')

# fig.tight_layout()  # otherwise the right y-label is slightly clipped
# plt.title('Step vs. Loss and WER')
# plt.show()