# Configuration


Use this section to adapt the configuration of the dataset, the model and the trainer.

In [1]:
#################################################
# log into HF to download the datasets and models
#################################################
import os
from huggingface_hub import login
from huggingface_hub import whoami

# The access token is taken from the environment so it never has to be stored in the notebook.
hf_token = os.environ.get("HF_TOKEN")

login(token=hf_token)
# whoami() returns a dict describing the account, not a plain string; only the
# account name is kept so the full account record is not echoed into the output.
username = whoami()["name"]
print(f"You are logged in as: {username}")

In [2]:
#################
## directories
#################
import os

# Local directory the dataset archives are unpacked into.
LOCAL_DATA_DIR = '/tmp/datasets'
# os.makedirs replaces the `!mkdir -p` shell escape: same effect, but it also works
# outside IPython and for paths containing spaces.
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)


# Parent directory for everything written by training runs.
BASE_DIR = '/tmp/trained_models'
os.makedirs(BASE_DIR, exist_ok=True)


# directory for model training
OUTPUT_DIR = os.path.join(BASE_DIR, 'NSS_tuning_xyz')

# Pretrained models are kept pre-downloaded on a local volume to avoid re-downloading them,
# but RunPod is often annoyingly slow in accessing local volumes,
# so it might be faster to download them anew.
PRETRAINED_PREDOWNLOADED_MODELS = '/asr_train/models/pretrained_models'


In [3]:
#################
## data sets
#################

# When True, the download cell fetches and unpacks the dataset archive again.
REDOWNLOAD_DATASETS = True
# Dataset to train on; the download cell accepts 'NSS' and 'SS' and rejects anything else.
DATASET_TYPE = 'NSS'

# When True, the augmented audio data is downloaded (for 'SS') and merged into the training set.
ADD_AUDIO_AUGMENTATION = False

In [4]:
#################
## Model settings
#################
# Available checkpoints: 'openai/whisper-tiny.en', 'openai/whisper-tiny', 'openai/whisper-base',
# 'openai/whisper-small', 'openai/whisper-medium', 'openai/whisper-large',
# 'openai/whisper-large-v3', 'openai/whisper-large-v3-turbo'
# The processor (feature extractor + tokenizer) is loaded from this name.
WHISPER_MODEL_TYPE = "openai/whisper-tiny"

# set language to what works best for Twi
LANGUAGE = 'yo'
TASK = "transcribe"

# For SS speech it is advised to update the full model, for NSS probably only the encoder (and maybe proj)
UPDATE_ENCODER = True
UPDATE_DECODER = False  # for non-standard speech, we typically don't update the decoder; but when training a base model for the language, decoder needs to be updated!
UPDATE_PROJ = True

# Turn on SpecAugment (standard settings)
USE_SPECAUGMENT = True


#################
## Base Model
#################
# BASE_MODEL_NAME is where the model weights are loaded from.
# NOTE(review): it presumably has to be the same Whisper size/family as WHISPER_MODEL_TYPE,
# because the processor is loaded from WHISPER_MODEL_TYPE -- confirm when switching models.

## Either start from the original Whisper checkpoint or use another base model
# BASE_MODEL_NAME = WHISPER_MODEL_TYPE

# # or use local, pre-downloaded models unless RunPod volume access is annoyingly slow
# BASE_MODEL_NAME = PRETRAINED_PREDOWNLOADED_MODELS + '/' + WHISPER_MODEL_TYPE.replace('openai/', '')

## Or continue training on one of our previously trained base models
BASE_MODEL_NAME = 'cdli/whisper-tiny_Akan_standardspeech_spec_and_audio_augment'
# BASE_MODEL_NAME = 'cdli/whisper-small_Akan_standardspeech_spec_and_audio_augment'
# BASE_MODEL_NAME = 'cdli/whisper-large_v3_turbo_Akan_standardspeech_spec_and_audio_augment'

print('Base model will be loaded from:', BASE_MODEL_NAME)



In [5]:
####################
## Trainer settings
####################


# Number of training steps between two log entries.
LOGGING_STEPS = 5
# if save steps is 0, only last and best model will be written
SAVE_STEPS = 50

# training duration
# NOTE(review): both values are passed to the training arguments; per the Transformers
# docs a positive max_steps overrides num_train_epochs, so MAX_EPOCHS presumably has
# no effect while MAX_STEPS > 0 -- confirm.
MAX_EPOCHS = 5
MAX_STEPS = 1000  # for larger datasets, you will want to increase this

# Learning Rate and LR Scheduler (LR_END and LR_DECAY_POWER only apply to polynomial)
LEARNING_RATE = 1e-4 #@param
LR_SCHEDULER_TYPE = 'polynomial' # constant_with_warmup or polynomial
LR_WARMUP_STEPS = 100
LR_END = 1e-8
LR_DECAY_POWER = 4
# see: https://huggingface.co/docs/transformers/v4.46.2/en/main_classes/optimizer_schedules#transformers.SchedulerType
# and here: https://www.kaggle.com/code/snnclsr/learning-rate-schedulers
# constant --> 'constant_with_warmup'
# polynomial --> 'get_polynomial_decay_schedule_with_warmup'


# Per-device batch sizes for training and for evaluation.
BATCH_SIZE = 32
EVAL_BATCH_SIZE = 16

#@markdown other settings relevant for evaluation
MAX_GEN_LEN = 128  # maximum generation length used during evaluation
EVAL_ON_START = True  # run one evaluation before the first training step
EVAL_STEPS = 50  # number of training steps between two evaluations

# for CPU, set both to false
USE_FP16 = True
USE_BF16 = False # only some GPUs support this, eg A100, A40

# checkpoints get huge for large models (~18 GB!)
NUM_CHECKPOINTS_TO_STORE = 2

# Imports and helper functions

In [6]:
from huggingface_hub import hf_hub_download

import random
import torchaudio
import librosa


import tarfile
import datasets
import matplotlib.pyplot as plt
import pandas as pd

import torch
import time


from dataclasses import dataclass
from typing import Any, Dict, List, Union

from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration
import os
import csv
import shutil
import numpy as np


import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Word- and character-error-rate metrics, loaded once through the `evaluate` library
# and shared by the scoring helpers defined further down.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

# Text normalizer applied to references and predictions before they are scored.
transcript_normalizer = BasicTextNormalizer()

In [7]:
def count_trainable_parameters(model):
    """Return how many scalar weights of ``model`` will receive gradients.

    Args:
        model: object whose ``parameters()`` yields tensors, e.g. a ``torch.nn.Module``.

    Returns:
        int: summed element count of all parameters with ``requires_grad`` set
        (0 when nothing is trainable).
    """
    # numel() keeps the result an exact Python int. np.prod over a torch.Size returns a
    # numpy scalar instead, and the float 1.0 for zero-dimensional parameters.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [8]:
def get_wer(references, predictions, normalize=True, verbose=True):
    """Compute the corpus-level word error rate of ``predictions`` against ``references``.

    Args:
        references: iterable of reference transcripts.
        predictions: iterable of predicted transcripts, aligned with ``references``.
        normalize: when True, both sides go through ``transcript_normalizer`` first.
        verbose: when True, print each (reference, prediction) pair that gets scored.

    Returns:
        The value returned by ``wer_metric.compute`` for the (optionally normalized) texts.
    """
    if normalize:
        hyps = [transcript_normalizer(text) for text in predictions]
        refs = [transcript_normalizer(text) for text in references]
    else:
        refs, hyps = references, predictions

    if verbose:
        # reference, prediction and an empty separator line per pair
        for ref, hyp in zip(refs, hyps):
            print(ref, hyp, "", sep="\n")

    return wer_metric.compute(references=refs, predictions=hyps)


def compute_metrics(pred):
    """Compute per-utterance averaged WER and CER for the trainer's evaluation loop.

    Args:
        pred: prediction object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 on positions to ignore).

    Returns:
        dict: ``{"wer": ..., "cer": ...}`` where every utterance's error rate is
        clipped to at most 1.0 before averaging. The clipped and the un-clipped
        averages are both printed.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    # -100 is not a valid token id and must be replaced by the pad token before decoding.
    # np.where builds new arrays, so the arrays owned by `pred` are left unmodified.
    # Predictions are masked as well, as a guard in case they were padded with -100.
    pred_ids = np.where(pred.predictions != -100, pred.predictions, pad_token_id)
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_token_id)

    # we do not want to group tokens when computing the metrics
    pred_strs = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_strs = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # calculate a per-example average
    wers = []
    cers = []
    for pred_str, label_str in zip(pred_strs, label_strs):
        hypothesis = transcript_normalizer(pred_str)
        reference = transcript_normalizer(label_str)
        wers.append(wer_metric.compute(predictions=[hypothesis], references=[reference]))
        cers.append(cer_metric.compute(predictions=[hypothesis], references=[reference]))

    # an error rate can exceed 1.0 (many insertions); clip so single utterances cannot dominate
    wer = np.mean([min(1.0, x) for x in wers])
    cer = np.mean([min(1.0, x) for x in cers])
    print('adjusted:', wer, cer)
    print('un-adjusted:', np.mean(wers), np.mean(cers))
    return {"wer": wer, "cer": cer}



In [9]:
# The following warning can be ignored:
# "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
# See: https://discuss.huggingface.co/t/finetuning-whisper-attention-mask-not-set-and-canot-be-inferred/97456
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch of audio features and labels.

    The audio features and the label token ids are padded separately, because they
    have different lengths and need different padding methods.
    """

    # processor providing `feature_extractor.pad` and `tokenizer.pad`
    processor: Any
    # token id that is stripped from the front of the labels when every row starts with it
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # audio side: the feature extractor pads and returns torch tensors
        audio_items = []
        for feature in features:
            audio_items.append({"input_features": feature["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # label side: pad the tokenized transcripts to the longest one in the batch
        label_items = []
        for feature in features:
            label_items.append({"input_ids": feature["labels"]})
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # padded positions become -100 so that the loss ignores them
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # drop the leading start token when all rows begin with it
        all_start_with_bos = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if all_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [10]:
# Turn off the `datasets` library cache and print the resulting cache state as confirmation.
datasets.disable_caching()
print('cache:', datasets.is_caching_enabled())


def _download_and_extract_archive(repo_id, filename, export_dir):
    """Download ``filename`` from the HF dataset repo ``repo_id`` and unpack it into ``export_dir``."""
    archive_path = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=filename)
    with tarfile.open(archive_path, "r:gz") as tar:
        # Use the "data" extraction filter where this Python version offers it: it refuses
        # members that would be written outside of export_dir. Older interpreters
        # without extraction filters fall back to the plain call.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(export_dir, filter="data")
        else:
            tar.extractall(export_dir)


def get_standard_speech_dataset(export_dir, also_download_augmented_audio_data=False, augmented_data_export_dir=None):
    """Download the Akan standard speech dataset and unpack it locally.

    Args:
        export_dir: directory the original audio archive is extracted into.
        also_download_augmented_audio_data: when True, the augmented audio archive
            is downloaded and extracted as well.
        augmented_data_export_dir: extraction directory for the augmented archive;
            required when ``also_download_augmented_audio_data`` is True.

    Raises:
        ValueError: if augmented data is requested without ``augmented_data_export_dir``.
    """
    # Validate before any download starts, so a bad call fails immediately instead of
    # after the (large) original archive has already been fetched and unpacked.
    if also_download_augmented_audio_data and not augmented_data_export_dir:
        raise ValueError('Need augmented_data_export_dir if also_download_augmented_audio_data is True')

    REPO = 'cdli/akan_standard_speech_data_16khz'

    # the original audio portion
    _download_and_extract_archive(REPO, "data.tar.gz", export_dir)

    # the augmented audio portion (augmented audio data for train)
    if also_download_augmented_audio_data:
        _download_and_extract_archive(REPO, "augmented_data.tar.gz", augmented_data_export_dir)


def get_nonstandard_speech_dataset(export_dir):
    """Download the Akan non-standard speech dataset and unpack it into ``export_dir``.

    Args:
        export_dir: directory the archive is extracted into.
    """
    REPO = 'cdli/akan_nonstandard_speech_data_16khz'
    # Audio and metadata used to be published as two separate files
    # ("Kumasi_Batch_16khz.tar.gz" and "metadata.csv"); they are now combined in this
    # single archive, so no renaming or metadata copying is needed after extraction.
    DATA_FILE = "data.tar.gz"

    tar_gz_file = hf_hub_download(
        repo_id=REPO, repo_type="dataset", filename=DATA_FILE)
    with tarfile.open(tar_gz_file, "r:gz") as tar:
        # Use the "data" extraction filter where this Python version offers it: it refuses
        # members that would be written outside of export_dir. Older interpreters
        # without extraction filters fall back to the plain call.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(export_dir, filter="data")
        else:
            tar.extractall(export_dir)


# Prepare Dataset(s)

## Download dataset

In [11]:
# Resolve the dataset directories first, then (optionally) download into them.
if DATASET_TYPE not in ('NSS', 'SS'):
    raise ValueError(f'Unknown dataset type: {DATASET_TYPE}')

DATASET_DIR = os.path.join(LOCAL_DATA_DIR, DATASET_TYPE)

# The augmented-data directory is needed by the later "augmented train set" cell whenever
# ADD_AUDIO_AUGMENTATION is set, so it is defined independently of REDOWNLOAD_DATASETS.
if ADD_AUDIO_AUGMENTATION:
    if DATASET_TYPE == 'SS':
        AUGMENTED_DATASET_DIR = os.path.join(LOCAL_DATA_DIR, DATASET_TYPE + '_augmentation')
    else:
        print('WARNING: ADD_AUDIO_AUGMENTATION is set, but augmented audio data is only downloaded for dataset type SS')

if REDOWNLOAD_DATASETS:
    if DATASET_TYPE == 'NSS':
        get_nonstandard_speech_dataset(DATASET_DIR)
    elif ADD_AUDIO_AUGMENTATION:
        print('downloading Akan standard speech data with audio augmentation data: ', AUGMENTED_DATASET_DIR)
        get_standard_speech_dataset(DATASET_DIR, also_download_augmented_audio_data=True, augmented_data_export_dir=AUGMENTED_DATASET_DIR)
        print('downloaded orig datata to:', DATASET_DIR)
    else:
        print('downloading Akan standard speech data (NO audio augmentation data)')
        get_standard_speech_dataset(DATASET_DIR)
        print('downloaded orig datata to:', DATASET_DIR)

    print('Downloaded dataset to:', DATASET_DIR)


## Prepare dataset loading

In [12]:
# optimize settings for dataset access
datasets.disable_caching()
print('cache:', datasets.is_caching_enabled())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device is: ', device)


# IMPORTANT! need to set to 1 to avoid the mapping to hang!
torch.set_num_threads(1)

# os.cpu_count() may return None when the count cannot be determined; fall back to a
# single worker instead of failing on min(32, None).
num_proc = min(32, os.cpu_count() or 1)
print('# processors:', num_proc)


# Load processor
print('Using Language: ', LANGUAGE)
print('Using model:', WHISPER_MODEL_TYPE)
processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_TYPE, language=LANGUAGE, task=TASK)

# This tokenizer is not a "fast" tokenizer (see: processor.tokenizer.is_fast), so there is
# no point in running the mapping batched; examples are converted one at a time.
def prepare_features(example):
    """Add model inputs, label token ids and the label length to one dataset example.

    Reads ``example["audio"]`` (``array`` and ``sampling_rate``) and ``example["text"]``;
    writes ``input_features``, ``labels`` and ``token_length`` and returns the example.
    """
    audio = example["audio"]
    extracted = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    example["input_features"] = extracted.input_features[0]

    token_ids = processor.tokenizer(example["text"]).input_ids
    example["labels"] = token_ids
    # also count number of tokens
    example["token_length"] = len(token_ids)
    return example


## Make dev set

In [13]:
# Load the validation split and convert every example into model inputs;
# the raw audio column is dropped once the features have been extracted.
dev_set = datasets.load_dataset(
    "audiofolder",
    data_dir=DATASET_DIR,
    split='validation',
    streaming=False,
).map(
    prepare_features,
    remove_columns=['audio'],
    writer_batch_size=1,
    num_proc=num_proc,
)
print('dev:', dev_set)


## Make train set

In [14]:
# Load the training split, shuffle it with a fixed seed, materialize the shuffled order
# and then convert every example into model inputs (dropping the raw audio column).
train_set = (
    datasets.load_dataset(
        "audiofolder",
        data_dir=DATASET_DIR,
        split='train',
        streaming=False,
    )
    .shuffle(seed=42)
    .flatten_indices()
    .map(
        prepare_features,
        remove_columns=['audio'],
        writer_batch_size=1,
        num_proc=num_proc,
    )
)

print(train_set)

## Make augmented train set

In [15]:
if ADD_AUDIO_AUGMENTATION:
    # load the augmented audio and run it through the same feature preparation
    augmented_train_set = datasets.load_dataset(
        "audiofolder",
        data_dir=AUGMENTED_DATASET_DIR,
        split='train',
        streaming=False,
    ).map(
        prepare_features,
        remove_columns=['audio'],
        writer_batch_size=1,
        num_proc=num_proc,
    )
    print('augmented training set size:', augmented_train_set)

    # merge original and augmented data, then reshuffle so that training iterates over
    # both sources evenly instead of one after the other
    merged = datasets.concatenate_datasets([train_set, augmented_train_set])
    train_set = merged.shuffle(seed=42).flatten_indices()
    print('combined training set size:', train_set)

# Setup Model and Trainer for Tuning

### Model

In [16]:
# Load the weights from BASE_MODEL_NAME and move the model to the selected device.
base_model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)
_ = base_model.to(device)
print('Using Language: ', LANGUAGE)
print('Using model:', WHISPER_MODEL_TYPE)


# ensure task and language for training
generation_config = base_model.generation_config
generation_config.language = LANGUAGE
generation_config.task = TASK
# clear any forced decoder ids on both the generation config and the model config
generation_config.forced_decoder_ids = None
base_model.config.forced_decoder_ids = None
# to use gradient checkpointing
base_model.config.use_cache = False
print('language set to:', generation_config.language)

In [17]:
# SpecAugment
# The flag is always written, so that USE_SPECAUGMENT = False really switches SpecAugment
# off even when the config of the loaded checkpoint has it enabled.
base_model.config.apply_spec_augment = USE_SPECAUGMENT

if USE_SPECAUGMENT:
    # Specaugment (use default settings, as per paper)

    # time masking
    base_model.config.mask_time_prob = 0.05
    base_model.config.mask_time_length = 10
    base_model.config.mask_time_min_masks = 2

    # feature masking
    base_model.config.mask_feature_prob = 0.05 # def: 0
    base_model.config.mask_feature_length = 10
    base_model.config.mask_feature_min_masks = 2 # def: 0

print('Using specaugment:', base_model.config.apply_spec_augment)


# which layers to tune
# NOTE(review): if proj_out shares its weight with the decoder token embeddings (weight
# tying), the proj_out call below re-enables gradients for that shared tensor even when
# UPDATE_DECODER is False -- confirm against the counts printed below.
base_model.model.encoder.requires_grad_(UPDATE_ENCODER)
base_model.model.decoder.requires_grad_(UPDATE_DECODER)
base_model.proj_out.requires_grad_(UPDATE_PROJ)


print('encoder params to update/total:', count_trainable_parameters(base_model.model.encoder), base_model.model.encoder.num_parameters())
print('decoder parans to update/total:', count_trainable_parameters(base_model.model.decoder), base_model.model.decoder.num_parameters())

print('overall # trainable parameters:', count_trainable_parameters(base_model))
print('.   overall # model parameters:', base_model.model.num_parameters())

### Trainer

In [24]:
# Training Hyper Parameters

LOG_DIR = os.path.join(OUTPUT_DIR, 'logs')

# `lr_end` and `power` are arguments of the polynomial schedule only. They are passed
# just for that schedule, so that switching LR_SCHEDULER_TYPE to another scheduler
# does not hand it keyword arguments it does not know.
if LR_SCHEDULER_TYPE == 'polynomial':
    LR_SCHEDULER_KWARGS = {
        "lr_end": LR_END,  # the final LR, crucial for polynomial decay
        "power": LR_DECAY_POWER,  # exponent of the decay
    }
else:
    LR_SCHEDULER_KWARGS = {}

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    logging_dir=LOG_DIR,
    logging_steps=LOGGING_STEPS,
    report_to=["tensorboard"],
    include_num_input_tokens_seen=True,
    ### on GPU, can either do fp16 or bf16 depending on specific GPU
    fp16=USE_FP16,
    bf16=USE_BF16,
    push_to_hub=False,
    remove_unused_columns=False,
    #
    # NOTE(review): per the TrainingArguments docs a positive max_steps overrides
    # num_train_epochs, so MAX_EPOCHS presumably only matters when MAX_STEPS <= 0 -- confirm.
    num_train_epochs=MAX_EPOCHS,
    max_steps=MAX_STEPS,
    #
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    #
    per_device_train_batch_size=BATCH_SIZE,
    #
    eval_on_start=EVAL_ON_START,
    predict_with_generate=True,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    eval_steps=EVAL_STEPS,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument
    eval_strategy="steps",
    generation_max_length=MAX_GEN_LEN,
    # eval_accumulation_steps=8, # -- no memory issues right now, so we skip this
    #
    metric_for_best_model="wer",
    greater_is_better=False,
    #
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    # The warmup and total step counts are not part of these kwargs; they are taken
    # from warmup_steps / max_steps set in these arguments, and the initial LR is learning_rate.
    lr_scheduler_kwargs=LR_SCHEDULER_KWARGS,

    learning_rate=LEARNING_RATE,
    warmup_steps=LR_WARMUP_STEPS,
    #
    save_steps=SAVE_STEPS,
    save_strategy="steps",
    save_total_limit=NUM_CHECKPOINTS_TO_STORE,
    load_best_model_at_end=True,
    # group_by_length=True
    # auto_find_batch_size=True
)

print('trainer args set, writing to:', OUTPUT_DIR)

In [19]:
# Collator that pads audio features and labels for every batch.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=base_model.config.decoder_start_token_id,
    processor=processor,
)

# Trainer wiring together the model, the prepared datasets, the collator and the metrics.
trainer = Seq2SeqTrainer(
    model=base_model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=dev_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
)



# Run Training

Note: TensorBoard does not display properly inside Jupyter notebooks, so run it locally with the
command below. Depending on the environment, you need port forwarding or a port binding provided by the infrastructure (e.g. RunPod).

```!tensorboard  --logdir {LOG_DIR} --bind_all --port 6006```

In [20]:
# Show the output directory the training arguments were configured with, as a last check before training.
print('using:', OUTPUT_DIR)

In [22]:
# Alternative call that resumes from a saved checkpoint instead of starting a fresh run
# (NOTE(review): presumably the latest checkpoint in OUTPUT_DIR -- confirm):
# trainer.train(resume_from_checkpoint = True)
trainer.train()

In [23]:
# eval after training: score the dev set once more, passing the configured language along
trainer.evaluate(dev_set, language=LANGUAGE)