In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Aug 23 23:44:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

**Prepare Environment**

In [None]:
!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets transformers accelerate evaluate jiwer tensorboard gradio

In [None]:
# Connect to the Hugging Face Hub.
# notebook_login() renders an interactive login widget (the VBox output of this cell);
# the session must be authenticated because training later pushes to the Hub.

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

**Finetuning Whisper**

The ASR pipeline can be decomposed into three stages:

*   A **feature extractor** which pre-processes the raw audio inputs
*   The **model** which performs the sequence-to-sequence mapping
*   A **tokenizer** which post-processes the model outputs to text format

**Prepare Feature Extractor, Tokenizer and Data**

In [None]:
from datasets import Dataset, Audio, DatasetDict, load_dataset
import pandas as pd

In [None]:
# Read metadata.csv into a DataFrame (columns: file_name, transcription).
# The conversion to a file_name -> transcription dict happens in a later cell.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this notebook — confirm before a fresh run.

transcriptions_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ak-gh/metadata.csv")

In [None]:
# Peek at the first three rows to sanity-check file names and transcriptions.
transcriptions_df.head(3)

Unnamed: 0,file_name,transcription
0,ak_gh_image_0844_u1672_1_1683153607764_17593.wav,Ingresi abrɔfo mmienu a baako yɛ ɔbaa a baako ...
1,ak_gh_image_0844_u1684_1_1683445885695_01769.wav,"Obi ne nan da ɔfofor ne nan do, na wɔtse egua ..."
2,ak_gh_image_0844_u1734_1_1683308703055_04934.wav,Ɔbaa baako ne ɔbarima baako a wɔadi dɛm. Ɔbari...


In [None]:
# Confirm the column names relied on below: file_name and transcription.
transcriptions_df.columns

Index(['file_name', 'transcription'], dtype='object')

In [None]:
# Build the file_name -> transcription lookup that prepare_data uses to label each clip.
transcription_dict = transcriptions_df.set_index("file_name")["transcription"].to_dict()

In [None]:
# Load every audio file in the Drive folder with the generic "audiofolder" builder;
# it produces a single "train" split, which is then divided into train/test.

dataset_folder = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/Colab Notebooks/ak-gh/sr-ak-audio")
# Fix the shuffle seed so the 80/20 split is identical on every run; without it the
# WER is measured on a different held-out set each time the notebook is executed.
dataset_folder = dataset_folder["train"].train_test_split(test_size=0.2, seed=42)
# Cast the audio column to 16 kHz so every decoded clip carries that sampling rate.
dataset_folder = dataset_folder.cast_column("audio", Audio(sampling_rate=16000))
print(dataset_folder["train"])
print(dataset_folder["test"])

Resolving data files:   0%|          | 0/1500 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/1500 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio'],
    num_rows: 1200
})
Dataset({
    features: ['audio'],
    num_rows: 300
})


In [None]:
# Inspect one decoded example: its file path, waveform array and sampling rate.
dataset_folder["train"][0]['audio']

{'path': '/content/drive/MyDrive/Colab Notebooks/ak-gh/sr-ak-audio/ak_gh_image_0930_u886_1_1682781578830_02168.wav',
 'array': array([ 7.62939453e-04, -1.09863281e-03, -4.51660156e-03, ...,
         6.10351562e-05,  3.05175781e-05,  0.00000000e+00]),
 'sampling_rate': 16000}

In [None]:
# Set up the feature extractor from the pretrained whisper-tiny checkpoint.
# prepare_data calls it to turn each raw audio array into model input features.
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
# Set up the Whisper tokenizer used to encode transcriptions and decode predictions.
from transformers import WhisperTokenizer

# The language is set to "yoruba" (Whisper language code "yo") even though the corpus is Akan.
# NOTE(review): presumably Yoruba is a stand-in because Whisper has no Akan language token — confirm.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="yoruba", task="transcribe")

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [None]:
# Set up the Whisper processor, which wraps both the tokenizer and the feature extractor.
# It is configured with the same checkpoint, language and task as the standalone tokenizer,
# and is what the data collator and the Trainer receive later on.

from transformers import WhisperProcessor


processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="yoruba", task="transcribe")

In [None]:
# Per-example preprocessing: read the decoded audio from batch["audio"], convert it to
# log-Mel input features, and attach the tokenized transcription looked up by file name.


def prepare_data(batch):
  """Convert one dataset example into Whisper training inputs.

  Args:
    batch: A single example whose "audio" entry provides "path", "array" and
      "sampling_rate".

  Returns:
    The same mapping with two keys added: "input_features" (log-Mel features
    computed by the feature extractor) and "labels" (token ids of the
    transcription).

  Raises:
    KeyError: If the audio file has no entry in transcription_dict. Failing here
      is deliberate: substituting a placeholder string would silently train the
      model to emit that placeholder for the affected clips.
  """
  # The audio column was already cast to 16 kHz when the dataset was loaded,
  # so the array read here carries that sampling rate.
  audio = batch['audio']

  # Transcriptions are keyed by bare file name, so drop the directory part of the path.
  file_name = audio["path"].split('/')[-1]
  if file_name not in transcription_dict:
    raise KeyError(f"No transcription found for audio file {file_name!r}")
  transcription = transcription_dict[file_name]

  # compute log-Mel input features from input audio array
  batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

  # encode target text to label ids
  batch['labels'] = tokenizer(transcription).input_ids
  return batch


In [None]:
# Run prepare_data over both splits and drop the original columns (the raw "audio"),
# leaving only the "input_features" and "labels" that prepare_data adds.
dataset_folder = dataset_folder.map(prepare_data, remove_columns=dataset_folder.column_names["train"])

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Verify that both splits now expose only input_features and labels.
print(dataset_folder)

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 300
    })
})


**Training and Evaluation**

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

**Loading Pretrained Model Checkpoint**

In [None]:
from transformers import WhisperForConditionalGeneration

# Start from the pretrained whisper-tiny weights; the Trainer fine-tunes this model object.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

In [None]:
# Generation settings; these match the language/task given to the tokenizer and processor.
model.generation_config.language = "yoruba"
model.generation_config.task = "transcribe"
# Clear forced decoder ids on the generation config.
# NOTE(review): the training log still warns about forced_decoder_ids
# [[1, 50259], [2, 50359], [3, 50363]]; those may come from model.config rather than
# generation_config — confirm.
model.generation_config.forced_decoder_ids = None
# model.config.suppress_tokens = []

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
  """Collate preprocessed speech examples into one padded training batch.

  Audio features and label token ids are padded separately because they have
  different lengths and need different padding: features go through the
  processor's feature extractor, labels through its tokenizer. Padded label
  positions are set to -100.

  If every label sequence in the batch starts with the decoder start token, that
  first column is removed.

  Attributes are documented inline below.
  """
  # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
  processor: Any
  # Token id that opens every label sequence and should be stripped. When left as
  # None the tokenizer's bos_token_id is used (the original behaviour). For Whisper,
  # pass model.config.decoder_start_token_id: tokenized labels open with the
  # start-of-transcript token, which is not necessarily the tokenizer's bos token,
  # so the bos-based check can leave a duplicate start token in the labels.
  decoder_start_token_id: Union[int, None] = None

  def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
    """Pad `features` into a batch.

    Args:
      features: Examples, each with "input_features" and "labels".

    Returns:
      The padded feature batch with a "labels" tensor added.
    """
    # split inputs and labels since they have to be of different lengths and need different padding methods
    # first treat the audio inputs by simply returning torch tensors
    input_features = [{"input_features": feature["input_features"]} for feature in features]
    batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

    # get the tokenized label sequences
    label_features = [{"input_ids": feature["labels"]} for feature in features]
    # pad the labels to max length
    labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

    # replace padding with -100 to ignore loss correctly
    labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

    # Pick the token to strip: the explicit id if configured, else the tokenizer's bos token.
    start_token_id = self.decoder_start_token_id
    if start_token_id is None:
      start_token_id = self.processor.tokenizer.bos_token_id

    # Strip the leading start token only when it is present in every row.
    # The extra guards avoid indexing an empty label matrix or comparing against None.
    if (
      start_token_id is not None
      and labels.shape[1] > 0
      and (labels[:, 0] == start_token_id).all().cpu().item()
    ):
      labels = labels[:, 1:]

    batch["labels"] = labels

    return batch


In [None]:
# Build the collator around the shared processor (feature extractor + tokenizer).
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
  processor=processor,
  # Left disabled: only enable if the collator class accepts a decoder_start_token_id field.
  # decoder_start_token_id=model.config.decoder_start_token_id,
)

In [None]:
import evaluate
metric = evaluate.load("wer")

def compute_metrics(pred):
  """Compute word error rate (as a percentage) for one evaluation pass.

  Args:
    pred: Prediction object exposing `predictions` (generated token ids) and
      `label_ids` (reference token ids, with -100 at ignored positions).
      label_ids is expected to support `.copy()` — presumably a numpy array as
      handed over by the Trainer; confirm if used elsewhere.

  Returns:
    A dict {"wer": value} where value is 100 * the metric's WER.
  """
  pred_ids = pred.predictions
  # Copy before editing so the caller's label array is not modified in place.
  label_ids = pred.label_ids.copy()

  # replace -100 with the pad_token_id so the ids can be decoded
  label_ids[label_ids == -100] = tokenizer.pad_token_id

  # decode to text, dropping special tokens from both predictions and references
  pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

  wer = 100 * metric.compute(predictions=pred_str, references=label_str)

  return {"wer": wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training and evaluation configuration for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="nyarkssss/whisper-tiny-ak",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    fp16=True,
    learning_rate=1e-5,
    warmup_steps=500,
    # max_steps overrides num_train_epochs (the Trainer logs this when it is constructed).
    # NOTE(review): the recorded run reached epoch ~53, with validation loss rising after
    # step 1000 and the best WER at step 1000 — consider far fewer steps for 1200 examples.
    max_steps=4000,
    gradient_checkpointing=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    # Evaluate by generating text so compute_metrics can score WER on decoded output.
    predict_with_generate=True,
    generation_max_length=225,
    # Save and evaluate on the same 1000-step cadence.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    # Keep the checkpoint with the lowest WER (lower is better) at the end of training.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)



In [None]:
from transformers import Seq2SeqTrainer
# Wire the model, both dataset splits, the padding collator and the WER metric together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_folder["train"],
    eval_dataset=dataset_folder["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor is passed in the `tokenizer` slot.
    # NOTE(review): presumably so it is saved alongside the checkpoints — confirm.
    tokenizer=processor.feature_extractor,
)


max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Save the processor files into the training output directory.
processor.save_pretrained(training_args.output_dir)

[]

In [None]:
# Run fine-tuning; evaluation (loss and WER) is reported every eval_steps.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1000,0.1919,0.864111,61.919505
2000,0.0111,1.152422,64.925597
3000,0.0031,1.269873,63.727155
4000,0.0022,1.305437,66.883052


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50

TrainOutput(global_step=4000, training_loss=0.29534042640496044, metrics={'train_runtime': 8674.8444, 'train_samples_per_second': 7.378, 'train_steps_per_second': 0.461, 'total_flos': 1.57560864768e+18, 'train_loss': 0.29534042640496044, 'epoch': 53.333333333333336})

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login done earlier).
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


CommitInfo(commit_url='https://huggingface.co/nyarkssss/whisper-tiny-ak/commit/a15f2b7fec58923495f591fd3717406229cf5977', commit_message='End of training', commit_description='', oid='a15f2b7fec58923495f591fd3717406229cf5977', pr_url=None, pr_revision=None, pr_num=None)

**Gradio App demo**

In [None]:
from transformers import pipeline
import gradio as gr
import torch

# Load the fine-tuned checkpoint from the Hub. Pass `device` explicitly: without it the
# pipeline stays on CPU even when a GPU is available.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="nyarkssss/whisper-tiny-ak",
    device=0 if torch.cuda.is_available() else -1,
)

def transcribe(audio):
    """Transcribe a recorded clip.

    Args:
        audio: Path to the recorded audio file, or None when nothing was recorded.

    Returns:
        The recognised text, or an empty string when there is no recording.
    """
    if audio is None:
        return ""
    return pipe(audio)["text"]

iface = gr.Interface(
    fn=transcribe,
    # `sources` (a list) replaces the `source` keyword, which the installed Gradio
    # rejects with "Audio.__init__() got an unexpected keyword argument 'source'".
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Whisper Tiny Akan",
    description="Realtime demo for Akan speech recognition using a fine-tuned Whisper tiny model.",
)

iface.launch()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


TypeError: Audio.__init__() got an unexpected keyword argument 'source'

In [None]:
# input_str = dataset_folder["train"][0]["labels"]
# labels = tokenizer(input_str).input_ids
# decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
# decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

# print(f"Input:                 {input_str}")
# print(f"Decoded w/ special:    {decoded_with_special}")
# print(f"Decoded w/out special: {decoded_str}")
# print(f"Are equal:             {input_str == decoded_str}")
