# Audio Transformers - Automatic Speech Recognition (ASR)

By Alberto Valdés

**Mail 1:** anvaldes@uc.cl

**Mail 2:** alberto.valdes.gonzalez.96@gmail.com

This notebook was executed in Google Colab using a A100-GPU.

# OpenAI - Whisper

### Start of execution

In [1]:
import time

In [2]:
start = time.time()

### 1. Setting the environment

In [3]:
!pip install -q datasets==2.20.0

In [4]:
!pip install -q bitsandbytes==0.43.1

### 2. Import Libraries

In [5]:
import torch
import pandas as pd
from datasets import Audio
from functools import partial
from google.colab import userdata
from dataclasses import dataclass
from transformers import Seq2SeqTrainer
from typing import Any, Dict, List, Union
from transformers import WhisperProcessor
from datasets import load_dataset, DatasetDict
from transformers import Seq2SeqTrainingArguments
from IPython.display import Audio as AudioDisplay
from transformers import WhisperForConditionalGeneration
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

### 3. Functions

In [6]:
def prepare_dataset(example):
    audio = example["audio"]

    example = processor(
        audio=audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=example["sentence"],
    )

    # compute input length of audio sample in seconds
    example["input_length"] = len(audio["array"]) / audio["sampling_rate"]

    return example

In [7]:
def is_audio_in_length_range(length):
    return length < max_input_length

In [8]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [
            {"input_features": feature["input_features"][0]} for feature in features
        ]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [9]:
def split_n_gram(text, n):

  all_n_grams = []

  N = int(len(text) - n + 1)

  for i in range(N):
    all_n_grams.append(text[i: i + n])

  return all_n_grams

In [10]:
def score_sim(text_1, text_2, n):

  text_1_l = split_n_gram(text_1, n)
  text_2_l = split_n_gram(text_2, n)

  #--------------------------------------------------

  s_1 = 0

  for t_1 in text_1_l:
    if t_1 in text_2_l:
      s_1 = s_1 + 1

  #--------------------------------------------------

  s_2 = 0

  for t_2 in text_2_l:
    if t_2 in text_1_l:
      s_2 = s_2 + 1

  #--------------------------------------------------

  if len(text_1_l) == 0:
    s_1 = 0
  else:
    s_1 = s_1/len(text_1_l)

  #--------------------------------------------------

  if len(text_2_l) == 0:
    s_2 = 0
  else:
    s_2 = s_2/len(text_2_l)

  #--------------------------------------------------

  s = (s_1 + s_2)/2

  #--------------------------------------------------

  return s

In [11]:
def predict_text(sample):

  input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
  input_features = input_features.to('cuda')
  predicted_ids = model.generate(input_features)

  prediction = processor.batch_decode(predicted_ids, skip_special_tokens = True)[0]

  return prediction

### 4. Preparation

In [12]:
max_input_length = 30.0

In [13]:
from huggingface_hub import notebook_login

In [14]:
HUGGING_FACE_TOKEN = userdata.get('HUGGING_FACE_TOKEN')

In [15]:
HUGGING_FACE_TOKEN

'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'

In [16]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 5. Load datasets

In [17]:
common_voice = DatasetDict()

common_voice["train"] = load_dataset(
    "mozilla-foundation/common_voice_13_0", "dv", split="train+validation"
)

common_voice["test"] = load_dataset(
    "mozilla-foundation/common_voice_13_0", "dv", split="test"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [18]:
AudioDisplay(common_voice["train"][0]['audio']['path'])

In [19]:
common_voice = common_voice.select_columns(["audio", "sentence"])

### 6. Processor

In [20]:
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="sinhalese", task="transcribe"
)

In [21]:
sampling_rate = processor.feature_extractor.sampling_rate
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [22]:
common_voice = common_voice.map(
    prepare_dataset, remove_columns = common_voice.column_names["train"], num_proc = 1
)

In [23]:
common_voice["train"] = common_voice["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)

### 7. Data Collator

In [24]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor = processor)

### 8. Model

In [25]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [26]:
# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

In [27]:
# set language and task for generation and re-enable cache
model.generate = partial(
    model.generate, language="sinhalese", task="transcribe", use_cache=True
)

### 9. Training

In [28]:
num_epochs = 3

In [29]:
training_args = Seq2SeqTrainingArguments(
    output_dir = "logs",  # name on the HF Hub
    num_train_epochs = num_epochs,
    per_device_train_batch_size = 16,
    gradient_accumulation_steps = 1,  # increase by 2x for every 2x decrease in batch size
    learning_rate = 1e-5,
    lr_scheduler_type = "constant_with_warmup",
    max_steps = -1,  # increase to 4000 if you have your own GPU or a Colab paid plan
    fp16 = True,
    fp16_full_eval = True,
    evaluation_strategy = "epoch",
    per_device_eval_batch_size = 16,
    generation_max_length=225,
    save_steps = 0,
    logging_steps = 25,
    report_to = ["tensorboard"]
)



In [30]:
trainer = Seq2SeqTrainer(
    args = training_args,
    model = model,
    train_dataset = common_voice["train"],
    eval_dataset = common_voice["test"],
    data_collator = data_collator,
    tokenizer = processor
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1719,0.19658
2,0.115,0.155058
3,0.0698,0.153089


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


TrainOutput(global_step=921, training_loss=0.39800833219037385, metrics={'train_runtime': 2600.3251, 'train_samples_per_second': 5.658, 'train_steps_per_second': 0.354, 'total_flos': 4.24566840950784e+18, 'train_loss': 0.39800833219037385, 'epoch': 3.0})

### 10. Inference

In [32]:
model.config.forced_decoder_ids = None

In [33]:
ds_train = load_dataset(
    "mozilla-foundation/common_voice_13_0", "dv", split = "train+validation"
)

In [34]:
ds_train = ds_train.cast_column("audio", Audio(sampling_rate = 16_000))

In [35]:
ds_test = load_dataset(
    "mozilla-foundation/common_voice_13_0", "dv", split = "test"
)

In [36]:
ds_test = ds_test.cast_column("audio", Audio(sampling_rate = 16_000))

In [37]:
N_train = int(len(ds_train)/10)
N_test = int(len(ds_test)/10)

In [38]:
N_train, N_test

(490, 221)

In [39]:
y_train = []

for i in range(N_train):

  y_train.append(ds_train[i]['sentence'])

y_train = pd.Series(y_train)

In [40]:
y_test = []

for i in range(N_test):

  y_test.append(ds_test[i]['sentence'])

y_test = pd.Series(y_test)

In [41]:
y_pred_train = []

for i in range(N_train):

  sample = ds_train[i]["audio"]

  y_pred_train.append(predict_text(sample))

y_pred_train = pd.Series(y_pred_train)

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [42]:
y_pred_test = []

for i in range(N_test):

  sample = ds_test[i]["audio"]

  y_pred_test.append(predict_text(sample))

y_pred_test = pd.Series(y_pred_test)

### 11. Performance

In [43]:
n_gram = 2

In [44]:
score_train = []

for i in range(N_train):

  p_1 = y_train[i]
  p_2 = y_pred_train[i]

  score = score_sim(p_1, p_2, n_gram)

  score_train.append(score)

score_train = pd.Series(score_train)

In [45]:
round(score_train.mean()*100, 2)

96.13

In [46]:
score_test = []

for i in range(N_test):

  p_1 = y_test[i]
  p_2 = y_pred_test[i]

  score = score_sim(p_1, p_2, n_gram)

  score_test.append(score)

score_test = pd.Series(score_test)

In [47]:
round(score_test.mean()*100, 2)

84.54

### End of execution

In [48]:
end = time.time()

delta = (end - start)

hours = int(delta/3_600)
mins = int((delta - hours*3_600)/60)
secs = int(delta - hours*3_600 - mins*60)

print(f'Hours: {hours}, Minutes: {mins}, Seconds: {secs}')

Hours: 1, Minutes: 6, Seconds: 47
