# GPU

In [14]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Not connected to a GPU


# Imports

In [15]:
# use datasets to download and prepare our training data and transformers to load and train our Whisper model.
!pip install datasets>=2.6.1
!pip install sentencepiece
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-zxdjvmcv
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-zxdjvmcv
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
#  upload model checkpoints directly the Hugging Face Hub 

token = "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"
# import the relavant libraries for loggin in
from huggingface_hub import HfApi, HfFolder

# set api for login and save token
api=HfApi()
api.set_access_token(token)
folder = HfFolder()
folder.save_token(token)
print("login")


# from huggingface_hub import notebook_login
# notebook_login()

login




In [None]:
import pickle
import sentencepiece
import datasets
from datasets import Audio
from datasets import Dataset
from datasets import load_dataset
from datasets import DownloadConfig
from datasets import Features

# Load Data

 ## Load WhisperFeatureExtractor
 load feature extractor from the pre-trained checkpoint with default values

In [None]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

## Load WhisperTokenizer

Whisper model outputs a sequence of token ids. 

The tokenizer maps each of these token ids to their corresponding text string. 

We will load the pre-trained tokenizer and use it for fine-tuning without any further modifications.

In [None]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

## Combine To Create A WhisperProcessor

In [None]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

## Load DataSet from Hub

In [None]:
from datasets import load_dataset
from datasets import DownloadConfig
link = "DTU54DL/common-accent-augmented"
common_train = load_dataset(link, download_config=DownloadConfig(delete_extracted=True))
common_train



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'accent', 'input_features', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'accent', 'input_features', 'labels'],
        num_rows: 451
    })
})

In [None]:
link = "DTU54DL/common-native"
common_test = load_dataset(link, download_config=DownloadConfig(delete_extracted=True))
common_test



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'accent'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['audio', 'sentence', 'accent'],
        num_rows: 994
    })
})

# Training and Evaluation

We'll follow these steps:

* Define **data collator**: data collator takes pre-processed data and prepares PyTorch tensors ready for the model.
* **Evaluation metrics**: during evaluation, we evaluate the model using WER metric. We need to define a compute_metrics function that handles this computation.
* **Load pre-trained checkpoint**: load a pre-trained checkpoint and configure it correctly for training
* Define **training configuration**: this will be used by **Trainer** to define the training schedule.

After tuning the model, we evaluate it on test data to verify that we have correctly trained it to transcribe speech.

## Define Data Collator

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

Initialise the defined data collator :

In [None]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Evaluation Metrics

In [None]:
import evaluate

metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Define function that takes model predictions and returns the WER metric.

* It first replaces -100 with the pad_token_id in the label_ids (undoing the step we applied in the data collator to ignore padded tokens correctly in the loss).

* It then decodes the predicted and label ids to strings. 

* Finally, it computes the WER between the predictions and reference labels:

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Load a Pre-Trained Checkpoint 

In [None]:
# load the pre-trained Whisper tiny checkpoint.
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

Downloading:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]

Override generation arguments - no tokens are forced as decoder outputs (see forced_decoder_ids), no tokens are suppressed during generation (see suppress_tokens):

In [None]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

## Define Training Configuration

**Final step**: define all parameters related to training.

In [None]:
data_train = common_train['train']
print(data_train)

data_test = common_test['test'] 
print(data_test)

Dataset({
    features: ['sentence', 'accent', 'input_features', 'labels'],
    num_rows: 10000
})
Dataset({
    features: ['audio', 'sentence', 'accent'],
    num_rows: 994
})


In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-comAccent-vs-comNative",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=8,  # increase by 2x for every 2x decrease in batch size
    learning_rate=5 *1e-4,
    warmup_steps=5,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=250, 
    eval_steps=50,
    logging_steps=50,
    optim = "adamw_hf",
    adam_beta1 = 0.9, # The beta1 hyperparameter for the AdamW optimizer.
    adam_beta2 = 0.98, # The beta2 hyperparameter for the AdamW optimizer.
    weight_decay = 0.1, # The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in AdamW optimizer.
    lr_scheduler_type  = "linear",
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    seed = 42,
    data_seed=42,
    push_to_hub=True,
)

**Note**: if one does not want to upload the model checkpoints to the Hub, set push_to_hub=False.

Forward training arguments to Trainer along with model,
dataset, data collator and `compute_metrics` function

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=data_train,
    eval_dataset=data_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

Cloning https://huggingface.co/bgstud/whisper-tiny-comAccent-vs-comNative into local empty directory.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


## Training

Training will take approx 5-10 hours depending on GPU / the one allocated to this Google Colab. If using this Google Colab directly to fine-tune a Whisper model, you should make sure that training isn't interrupted due to inactivity. 

Simple workaround to prevent this is to paste the following code into the console of this tab (right mouse click -> inspect -> Console tab -> insert code).

In [None]:
trainer.train()
# torch.cuda.empty_cache() 

The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: sentence, accent. If sentence, accent are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.


In [None]:
# trainer.push_to_hub()

In [None]:
# from transformers import WhisperForConditionalGeneration, WhisperProcessor

# link_checkpoint = "bgstud/whisper-tiny-libirClean-vs-commonNative-en"
# model = WhisperForConditionalGeneration.from_pretrained(link_checkpoint)
# processor = WhisperProcessor.from_pretrained(link_checkpoint)

In [None]:
# from transformers import pipeline
# import gradio as gr

# checkpoint = "bgstud/whisper-tiny-libirClean-vs-commonNative-en"
# pipe = pipeline(model=checkpoint, use_fast=False)  # change to "your-username/the-name-you-picked"

# def transcribe(audio):
#     text = pipe(audio)["text"]
#     return text

# iface = gr.Interface(
#     fn=transcribe, 
#     inputs=gr.Audio(source="microphone", type="filepath"), 
#     outputs="text",
#     title="whisper-tiny-en",
#     description="Realtime demo for Hindi speech recognition using a fine-tuned Whisper tiny model.",
# )

# iface.launch()

# Type Here


In [None]:
# hdas
# Write here: hdisadsaio
# 10:48
# 11:50
# 12:50
# 13:50




In [19]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

link = "CristianaLazar/whisper-tiny-laugm-lclean"
model = WhisperForConditionalGeneration.from_pretrained(link)
processor = WhisperProcessor.from_pretrained(link)

Downloading:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/862 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [20]:
from transformers import pipeline
import gradio as gr

pipe = pipeline(model="CristianaLazar/whisper-tiny-laugm-lclean")  # change to "your-username/the-name-you-picked"

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe, 
    inputs=gr.Audio(source="microphone", type="filepath"), 
    outputs="text",
    title="Whisper Small Hindi",
    description="Realtime demo for Hindi speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()


Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

