In [None]:
# System-level audio dependency: register the jonathonf/ffmpeg-4 PPA, refresh the
# package index, then install ffmpeg from the now-available sources.
# NOTE(review): third-party PPA — confirm it is still published before relying on it.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:5 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:6 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Ign:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease [15.9 kB]
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:13 http://ppa.launchpad.net

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install optuna
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive. Other cells in this notebook use relative
# "gdrive/MyDrive/..." paths, which only resolve against this mount point when the
# working directory is /content — NOTE(review): confirm the kernel's cwd.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (prompts for an access token in the notebook).
# NOTE(review): the training arguments in this notebook set push_to_hub=False, so
# this may only be needed if gated/private Hub assets are accessed — confirm.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
from datasets import load_dataset, DatasetDict

In [None]:
# Load the dataset splits previously saved to Google Drive; the "train" and "test"
# splits are consumed by the trainer further down.
# The path is relative, so it depends on the current working directory containing
# the `gdrive` mount — NOTE(review): presumably cwd is /content; confirm.
# Assumes the examples already carry "input_features" and "labels" columns
# (the data collator reads exactly those keys) — TODO confirm.
cc2 = DatasetDict.load_from_disk("gdrive/MyDrive/common_voice")

In [None]:
from transformers import WhisperProcessor

# Processor for the whisper-small checkpoint, configured for Swedish transcription.
# Its `feature_extractor` and `tokenizer` attributes are used by the data collator,
# and its `feature_extractor` is also handed to the trainer.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into a single padded batch.

    Every example must provide ``"input_features"`` and ``"labels"``. The two sides
    are padded separately: features through ``processor.feature_extractor.pad`` and
    labels through ``processor.tokenizer.pad``. Label positions whose attention mask
    is not 1 are replaced by -100, and the first label column is removed when every
    row starts with the tokenizer's ``bos_token_id``.
    """

    # Object exposing `feature_extractor` and `tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch for ``features`` with a ``"labels"`` entry added."""
        # Audio side: hand the features to the extractor's padder, asking for torch tensors.
        audio_side = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_side, return_tensors="pt")

        # Text side: the label ids go through the tokenizer's padder under "input_ids".
        text_tokenizer = self.processor.tokenizer
        text_side = [{"input_ids": example["labels"]} for example in features]
        padded_text = text_tokenizer.pad(text_side, return_tensors="pt")

        # Padding positions become -100 so they can be ignored by the loss.
        is_padding = padded_text.attention_mask.ne(1)
        labels = padded_text["input_ids"].masked_fill(is_padding, -100)

        # A leading BOS shared by all rows was added during tokenization; drop it
        # here because it is appended again later.
        every_row_starts_with_bos = (labels[:, 0] == text_tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance passed to the trainer; it pads with the processor's
# feature extractor (inputs) and tokenizer (labels).
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

# Load the "wer" metric from the `evaluate` library; compute_metrics calls
# `metric.compute(predictions=..., references=...)` on decoded strings.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
from transformers import WhisperTokenizer

# Stand-alone tokenizer used by compute_metrics to decode predictions and labels.
# NOTE(review): built from the same checkpoint and arguments as `processor`, so
# `processor.tokenizer` is presumably equivalent — confirm before deduplicating.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Swedish", task="transcribe")

In [None]:
def compute_metrics(pred):
    """Decode predictions and labels and return the word error rate in percent.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` as arrays of token
            ids, where ignored label positions are marked with -100.

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the result of the
        module-level ``metric.compute``.
    """
    # Local import keeps the cell self-contained.
    import numpy as np

    pred_ids = pred.predictions

    # Replace -100 with the pad token id so the labels can be decoded. The
    # substitution is done on a fresh array: assigning into `pred.label_ids`
    # would silently modify the caller's data.
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
# from transformers import WhisperForConditionalGeneration

# model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [None]:
# pip install optuna

In [None]:
# model.config.forced_decoder_ids = None
# model.config.suppress_tokens = []

In [None]:
# from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     num_train_epochs=1,
#     output_dir="gdrive/MyDrive/School_Stuff/School_Stuff/Masters/Scalable_ML/Lab2/model/",  # change to a repo name of your choice
#     per_device_train_batch_size=16,
#     gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
#     warmup_steps=500,
#     max_steps=500,
#     gradient_checkpointing=False,
#     fp16=True,
#     evaluation_strategy="steps",
#     per_device_eval_batch_size=8,
#     predict_with_generate=True,
#     generation_max_length=225,
#     logging_steps=25,
#     report_to=["tensorboard"],
#     load_best_model_at_end=True,
#     metric_for_best_model="wer",
#     greater_is_better=False,
#     push_to_hub=False,
# )

In [None]:
# from transformers import Seq2SeqTrainer

# trainer = Seq2SeqTrainer(
#     args=training_args,
#     model=model,
#     train_dataset=cc2["train"],
#     eval_dataset=cc2["test"],
#     model_init=model_init,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
#     tokenizer=processor.feature_extractor,
# )

In [None]:
import optuna
from transformers import Seq2SeqTrainingArguments
from transformers import WhisperForConditionalGeneration
from transformers import Seq2SeqTrainer


def objective(trial: optuna.Trial):
    """Run one Optuna trial: fine-tune whisper-small and return the training loss.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` and ``weight_decay``
            (both log-uniform in [4e-5, 0.01]) and the ``optim`` choice.

    Returns:
        The ``training_loss`` reported by ``trainer.train()``.
    """
    # Load fresh pretrained weights for every trial.
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []

    training_args = Seq2SeqTrainingArguments(
        num_train_epochs=1,  # overridden by max_steps below (the Trainer logs this)
        output_dir="gdrive/MyDrive/output_models/",  # change to a repo name of your choice
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same distribution.
        learning_rate=trial.suggest_float("learning_rate", 4e-5, 0.01, log=True),
        weight_decay=trial.suggest_float("weight_decay", 4e-5, 0.01, log=True),
        optim=trial.suggest_categorical("optim", ["adamw_hf", "adafactor"]),
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,  # increase by 2x for every 2x decrease in batch size
        warmup_steps=50,
        max_steps=300,
        gradient_checkpointing=False,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        fp16=True,
        per_device_eval_batch_size=8,
        predict_with_generate=True,
        generation_max_length=225,
        logging_steps=25,
        report_to=["tensorboard"],
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        push_to_hub=False,
    )

    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=cc2["train"],
        eval_dataset=cc2["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=processor.feature_extractor,
    )

    result = trainer.train()
    # NOTE(review): the study minimises training loss, while model selection above
    # uses "wer" — confirm that training loss is the intended search objective.
    return result.training_loss

In [None]:
# In-memory study that minimises the value returned by `objective` over four trials.
study = optuna.create_study(direction="minimize", study_name="hyper-parameter-search")
study.optimize(objective, n_trials=4)

# Report the best objective value, its parameters and the full trial, one per line.
print(study.best_value, study.best_params, study.best_trial, sep="\n")

[32m[I 2022-12-08 05:23:52,216][0m A new study created in memory with name: hyper-parameter-search[0m


Downloading:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/967M [00:00<?, ?B/s]

  learning_rate=trial.suggest_loguniform("learning_rate", low=4e-5, high=0.01),
  weight_decay=trial.suggest_loguniform("weight_decay", 4e-5, 0.01),
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 12360
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 300
  Number of trainable parameters = 241734912


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 5069
  Batch size = 8
[33m[W 2022-12-08 05:57:47,220][0m Trial 0 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-13-9ebedd117b3e>", line 46, in objective
    result = trainer.train()
  File "/usr/local/lib/python3.8/dist-packages/transformers/trainer.py", line 1536, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.8/dist-packages/transformers/trainer.py", line 1876, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/usr/local/lib/python3.8/dist-packages/transformers/trainer.py", line 2124, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/usr/local/lib/python3.8/dist-packages/transformers/trainer_

KeyboardInterrupt: ignored

In [None]:
# Row count of a 5000-example subset of the test split.
# Slicing a Dataset with `[:5000]` returns a dict of columns, so len() on it counts
# columns (not rows) and loads all 5000 examples into memory; select() keeps the
# result a Dataset, so len() reports rows.
len(cc2["test"].select(range(5000)))