# Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers

## Prepare Environment

In [None]:
# Ask the NVIDIA driver for the GPU status; IPython captures the command's
# output as a list of lines
smi_lines = !nvidia-smi
gpu_info = '\n'.join(smi_lines)

# Show the nvidia-smi report unless it contains the word 'failed'
has_gpu = 'failed' not in gpu_info
print(gpu_info if has_gpu else 'Not connected to a GPU')

Sat Dec  9 00:41:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os
# Requirements that carry a version specifier must be quoted: unquoted, the
# shell treats ">" as output redirection, so the constraint is silently dropped
# and a stray file (e.g. "=2.6.1") is written to the working directory.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install "datasets>=2.6.1"
%pip install transformers
# Alternative: install transformers from source
#!pip install git+https://github.com/huggingface/transformers
%pip install librosa
# "0.30" would be read as minor version 30, so the bound is spelled 0.3.0
%pip install "evaluate>=0.3.0"
%pip install jiwer
%pip install gradio
%pip install -U accelerate


Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.5.2
Collecting gradio
  Downloading gradio-4.8.0-py3-none-any.whl (16.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.104.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.

In [None]:
import os

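# Workaround for Seq2SeqTrainer not being found after the installs above:
# uncomment the next line and run this cell once. It terminates the kernel
# process so that the runtime restarts with the newly installed packages.
# os._exit(00)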

In [None]:
from huggingface_hub import notebook_login

# Link this notebook to a HuggingFace account (shows a login widget).
# The login is needed to download the dataset or to push the model to a repository.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Feature Pipeline

## Load Dataset

In [None]:
import os
import regex as re
from os.path import isfile, join
from datasets import concatenate_datasets, Dataset, load_dataset, DatasetDict
from google.colab import drive

# True:  download the unprocessed dataset from HuggingFace
# False: load the arrow shards previously stored on Google Drive
DOWNLOAD = False

# Mount the drive
drive.mount('/content/drive')

# Create Dataset Dictionary
common_voice = DatasetDict()

# Patterns selecting the arrow shards that are loaded from Drive. The training
# pattern only covers shards 00000-00019 of 42. The dot is escaped and the
# patterns are applied with fullmatch() so that only exact file names qualify.
r_train = re.compile(r"data-000[0-1][0-9]-of-00042\.arrow")
r_test = re.compile(r"data-0000[0-1]-of-00002\.arrow")

if DOWNLOAD:
  # Unprocessed dataset is downloaded from HuggingFace
  common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "de", split="train+validation", use_auth_token=True)
  common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "de", split="test", use_auth_token=True)
else:
  # The Drive folders are only listed in this branch, so a first-time download
  # does not fail when they do not exist yet.
  train_dir = "/content/drive/MyDrive/common_voice/train"
  test_dir = "/content/drive/MyDrive/common_voice/test"

  # os.listdir() returns entries in arbitrary order; sorting keeps the shard
  # order, and therefore the subset selected below, the same on every run.
  arrow_files_train = sorted(filter(r_train.fullmatch, os.listdir(train_dir)))
  arrow_files_test = sorted(filter(r_test.fullmatch, os.listdir(test_dir)))

  # Concatenate stored arrow files and assign as training and test data
  train_full = concatenate_datasets([Dataset.from_file(join(train_dir, arrow_file)) for arrow_file in arrow_files_train])
  test_full = concatenate_datasets([Dataset.from_file(join(test_dir, arrow_file)) for arrow_file in arrow_files_test])

  # Keep a fixed-size subset; min() guards against an out-of-range selection
  # when the stored shards hold fewer rows than requested.
  common_voice["train"] = train_full.select(range(min(10000, len(train_full))))
  common_voice["test"] = test_full.select(range(min(5000, len(test_full))))


Mounted at /content/drive


In [None]:
# Store the loaded dataset on Drive.
# This is only done for a freshly downloaded dataset: with DOWNLOAD = False the
# data was read from this very folder, and saving would write the truncated
# subset back onto the shards it was loaded from.
output_dir = "/content/drive/MyDrive/common_voice"
if DOWNLOAD:
  os.makedirs(output_dir, exist_ok=True)
  common_voice.save_to_disk(output_dir)

## Prepare Dataset

In [None]:
# Prepare dataset for use with Whisper
def prepare_dataset(batch):
    """Turn one dataset example into Whisper model inputs.

    Reads the "audio" entry (a mapping with "array" and "sampling_rate") and
    the "sentence" entry of `batch`, then stores two new entries on it:

    - "input_features": first element of the `input_features` returned by the
      notebook-level `feature_extractor` for the audio array
    - "labels": `input_ids` returned by the notebook-level `tokenizer` for the
      sentence

    Returns the same `batch` object that was passed in.
    """
    # Read the audio entry once and reuse it for both fields
    clip = batch["audio"]

    # Log-Mel input features computed from the raw audio array
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target text encoded to label ids
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [None]:
from datasets import Audio

# Remove unnecessary features.
# Only columns that are actually present are dropped, so the cell can be
# re-run without failing on columns that were already removed.
unused_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
common_voice = DatasetDict({
    split: dataset.remove_columns([column for column in unused_columns if column in dataset.column_names])
    for split, dataset in common_voice.items()
})

# Change sampling frequency from 48kHz to 16kHz prior to using it for Whisper
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

print(common_voice)
print(common_voice["train"][0])

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5000
    })
})


In [None]:
# Columns present before feature extraction; they are dropped by map() so that
# only the entries written by prepare_dataset remain
raw_columns = common_voice.column_names["train"]
common_voice = common_voice.map(prepare_dataset, remove_columns=raw_columns, num_proc=2)

# Print a compact summary of the first example rather than the complete
# (large, nested) input feature list
first_example = common_voice["train"][0]
print({
    "input_features": f"{len(first_example['input_features'])} x {len(first_example['input_features'][0])}",
    "labels": first_example["labels"],
})

In [None]:
# Run this code cell to store the processed dataset (output of the map step) on Drive

# Mount the Drive (needed if it is not mounted yet)
drive.mount('/content/drive')

# Processed features are stored in the 'common_voice_features' folder;
# the folder is created if it does not exist
output_dir = "/content/drive/MyDrive/common_voice_features"
os.makedirs(output_dir, exist_ok=True)

# Save to Drive
common_voice.save_to_disk(output_dir)