# Evaluate finetuned Whisper on Fleurs

## Reproduce eval demo on Chinese to check validity

Check the text tokenizer: encoding a transcription and decoding it again should return the original string.

In [None]:
from jiwer import wer
from whisper.normalizers import BasicTextNormalizer
import re
import unicodedata
import warnings
import utils
# warnings.filterwarnings('ignore')

In [None]:
from datasets import load_dataset

# FLEURS config names: "cmn_hans_cn" selects Chinese, "yue_hant_hk" selects Cantonese.
# Only the test split is loaded here.
dataset = load_dataset(path="google/fleurs", name="cmn_hans_cn", split="test")

In [8]:
print(dataset)

# Keep only the audio and the two transcription columns.
# The drop list is filtered against `column_names` so the cell can be re-run:
# an unconditional `remove_columns` raises once the columns are already gone.
unused_columns = ['id', 'num_samples', 'path', 'gender', 'lang_id', 'language', 'lang_group_id']
columns_to_drop = [name for name in unused_columns if name in dataset.column_names]
if columns_to_drop:
    dataset = dataset.remove_columns(columns_to_drop)
print(dataset[0])

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 945
})
{'audio': {'path': '11338778690242874435.wav', 'array': array([ 4.17232513e-07, -5.36441803e-07,  5.96046448e-07, ...,
       -1.31678581e-03, -1.24770403e-03, -1.29789114e-03]), 'sampling_rate': 16000}, 'transcription': '1940 年 8 月 15 日 盟 军 攻 入 法 国 南 部 这 次 进 攻 被 称 为 龙 骑 兵 行 动', 'raw_transcription': '1940 年 8 月 15 日，盟军攻入法国南部，这次进攻被称为“龙骑兵行动”。'}


In [9]:
# Tokenizer and processor are loaded from the same checkpoint and share the
# language/task settings.
from transformers import WhisperProcessor, WhisperTokenizer

whisper_settings = dict(language="Chinese", task="transcribe")

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", **whisper_settings)
processor = WhisperProcessor.from_pretrained("openai/whisper-base", **whisper_settings)

In [10]:
# Round-trip check: encode the first raw transcription into token ids, then decode
# the ids back, once keeping the special prompt tokens and once dropping them.
input_str = dataset[0]["raw_transcription"]
labels = tokenizer(input_str).input_ids
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=skip) for skip in (False, True)
)

report_lines = [
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
]
print(*report_lines, sep="\n")

Input:                 1940 年 8 月 15 日，盟军攻入法国南部，这次进攻被称为“龙骑兵行动”。
Decoded w/ special:    <|startoftranscript|><|zh|><|transcribe|><|notimestamps|>1940 年 8 月 15 日，盟军攻入法国南部，这次进攻被称为“龙骑兵行动”。<|endoftext|>
Decoded w/out special: 1940 年 8 月 15 日，盟军攻入法国南部，这次进攻被称为“龙骑兵行动”。
Are equal:             True


Check the base model on Chinese: first the `cmn_hans_cn` test split (simplified script), then the Cantonese `yue_hant_hk` test split.

In [34]:
%%time

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to("cuda")
processor = WhisperProcessor.from_pretrained("openai/whisper-base")

# Force the decoder prompt to Chinese ("zh") transcription.
# NOTE: the sample prediction printed further down comes out in traditional
# characters while the cmn_hans_cn references are simplified, so correctly
# recognised characters can still be counted as errors.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="zh", task="transcribe")

# Extra arguments for the mapped function must go through `fn_kwargs`. Positional
# arguments after the function are taken by `Dataset.map` itself (`with_indices`,
# `with_rank`, `input_columns`) and never reach `utils.map_to_pred`.
# The keyword names follow the call in the comparison cell later in this notebook;
# assumes `utils.map_to_pred` accepts them — confirm against utils.py.
result = dataset.map(
    utils.map_to_pred,
    fn_kwargs={"model": model, "processor": processor, "lang": "zh"},
    batched=True,
    batch_size=1,
)


  0%|          | 0/945 [00:00<?, ?ba/s]

In [38]:
# `wer` takes the references first and the predictions second. Both columns hold
# space-separated characters here, so every character is scored as one "word".
print(f"WER: {wer(result['ground_truth'], result['prediction'])}")

WER: 0.38239503090460675


In [36]:
# Spot-check the first example: its reference columns next to the model prediction.
print(result[0])

{'audio': {'path': None, 'array': array([ 0.00000000e+00, -3.05175781e-05,  0.00000000e+00, ...,
       -1.34277344e-03, -1.25122070e-03, -1.31225586e-03]), 'sampling_rate': 16000}, 'transcription': '1940 年 8 月 15 日 盟 军 攻 入 法 国 南 部 这 次 进 攻 被 称 为 龙 骑 兵 行 动', 'raw_transcription': '1940 年 8 月 15 日，盟军攻入法国南部，这次进攻被称为“龙骑兵行动”。', 'prediction': '1940 年 8 月 15 日 蒙 軍 公 路 法 國 南 部 這 次 進 攻 被 稱 為 龍 旗 冰 晴 洞', 'ground_truth': '1940 年 8 月 15 日 盟 军 攻 入 法 国 南 部 这 次 进 攻 被 称 为 龙 骑 兵 行 动'}


In [15]:
from datasets import load_dataset

# FLEURS config names: "yue_hant_hk" selects Cantonese, "cmn_hans_cn" selects Chinese.
# Only the test split is loaded here.
dataset = load_dataset(path="google/fleurs", name="yue_hant_hk", split="test")

Found cached dataset fleurs (/home/sivan/.cache/huggingface/datasets/google___fleurs/yue_hant_hk/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)


In [16]:
print(dataset)

# Keep only the audio and the `transcription` column.
# The drop list is filtered against `column_names` so the cell can be re-run:
# an unconditional `remove_columns` raises once the columns are already gone.
unused_columns = ['id', 'num_samples', 'path', 'gender', 'lang_id', 'language', 'lang_group_id',
                  'raw_transcription']
columns_to_drop = [name for name in unused_columns if name in dataset.column_names]
if columns_to_drop:
    dataset = dataset.remove_columns(columns_to_drop)
print(dataset[0])

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 819
})
{'audio': {'path': '4705644860951940224.wav', 'array': array([ 0.        ,  0.        ,  0.        , ...,  0.00025684,
       -0.00013679, -0.00051916]), 'sampling_rate': 16000}, 'transcription': '仍 有 許 多 當 時 在 此 的 男 女 存 活 了 下 來 還 有 更 多 人 的 摯 愛 在 此 被 殺 害 或 勞 動 至 死 不 管 是 不 是 猶 太'}


In [24]:
%%time

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to("cuda")
processor = WhisperProcessor.from_pretrained("openai/whisper-base")

# Force the decoder prompt to Chinese ("zh") transcription. The yue_hant_hk
# references are written in traditional characters (see the sample printed above).
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="zh", task="transcribe")

# A positional argument may not follow a keyword argument, so the language code
# cannot be passed as a bare "zh" after `batched=True` (SyntaxError). Extra
# arguments for the mapped function go through `fn_kwargs`.
# The keyword names follow the call in the comparison cell later in this notebook;
# assumes `utils.map_to_pred` accepts them — confirm against utils.py.
result = dataset.map(
    utils.map_to_pred,
    fn_kwargs={"model": model, "processor": processor, "lang": "zh"},
    batched=True,
    batch_size=1,
    remove_columns=['audio'],
)


  0%|          | 0/819 [00:00<?, ?ba/s]

CPU times: user 6min 57s, sys: 14.3 s, total: 7min 11s
Wall time: 4min 34s


In [26]:
# `wer` takes the references first and the predictions second. Both columns hold
# space-separated characters here, so every character is scored as one "word".
print(f"WER: {wer(result['transcription'], result['prediction'])}")

WER: 0.4356259136449531


In [25]:
# Spot-check the first example: the reference transcription next to the model prediction.
print(result[0])

{'transcription': '仍 有 許 多 當 時 在 此 的 男 女 存 活 了 下 來 還 有 更 多 人 的 摯 愛 在 此 被 殺 害 或 勞 動 至 死 不 管 是 不 是 猶 太', 'prediction': '仍 有 許 多 當 時 在 此 的 男 女 徐 樂 了 下 來 還 有 更 多 人 的 自 愛 在 此 被 殺 害 惑 努 動 之 死 不 過 是 不 事 由 太 '}


# Reproduce all reported results

In [2]:
import utils
from transformers import Seq2SeqTrainingArguments
from datasets import load_from_disk, load_dataset
from transformers import Seq2SeqTrainer
from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration
from transformers import WhisperTokenizer
from jiwer import wer

2022-11-18 03:25:37.658265: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-18 03:25:38.574598: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-18 03:25:40.257519: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-11-18 03:25:40.257685: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'li

### Load datasets

In [2]:
# !gsutil -m cp -n -r gs://capstone_datasets/fleurs/preprocess/fl_ch_features /home/sivan/datasets

In [3]:
# FLEURS config names of the languages to evaluate, in evaluation order.
langs = ["ko_kr", "he_il", "te_in", "cmn_hans_cn"]

# Language codes used for the normalizer and for the processor's decoder prompt.
codes = {
    "te_in": 'te',
    "ko_kr": 'ko',
    "he_il": 'iw',
    "cmn_hans_cn": 'zh',
}

# Language names used for the Whisper processor and tokenizer.
languages = {
    "te_in": 'Telugu',
    "ko_kr": 'Korean',
    "he_il": 'Hebrew',
    "cmn_hans_cn": 'Chinese',
}

# Step number of the best checkpoint of each finetuned model.
best = {
    "te_in": 1000,
    "ko_kr": 2000,
    "he_il": 1000,
    "cmn_hans_cn": 1000,
}

# Per-language containers, filled in by the cells below (five independent dicts).
tests, preprocess, models, collators, processors = ({} for _ in range(5))

In [4]:
# FLEURS metadata columns that are dropped from the raw test splits.
metadata_columns = ['id', 'num_samples', 'path', 'gender', 'lang_id', 'language', 'lang_group_id']

for lang in langs:
    # Feature dataset previously saved to disk for this language.
    preprocess[lang] = load_from_disk(f"/home/sivan/datasets/fl_{lang}_features")

    # Raw FLEURS test split, stripped of its metadata columns.
    raw_test_split = load_dataset("google/fleurs", lang, split='test')
    tests[lang] = raw_test_split.remove_columns(metadata_columns)

    # Whisper processor configured for this language.
    lang_processor = WhisperProcessor.from_pretrained(
        "openai/whisper-base",
        language=languages[lang],
        task="transcribe",
    )
    processors[lang] = lang_processor

    # Data collator built around that processor.
    collators[lang] = utils.DataCollatorSpeechSeq2SeqWithPadding(processor=lang_processor)


Found cached dataset fleurs (/home/sivan/.cache/huggingface/datasets/google___fleurs/ko_kr/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)
Found cached dataset fleurs (/home/sivan/.cache/huggingface/datasets/google___fleurs/he_il/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)
Found cached dataset fleurs (/home/sivan/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)
Found cached dataset fleurs (/home/sivan/.cache/huggingface/datasets/google___fleurs/cmn_hans_cn/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)


In [28]:
# universal training args
training_args = Seq2SeqTrainingArguments(
    output_dir="/home/sivan",
    evaluation_strategy="steps",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=225,
    logging_steps=25,
    report_to=["tensorboard"],
    metric_for_best_model="wer",
    load_best_model_at_end=True,
    greater_is_better=False,
    disable_tqdm=False,  # set false to see progress bar
)

using `logging_steps` to initialize `eval_steps` to 25
PyTorch: setting up devices


In [25]:
%%time
for lang in langs:
    print(f"Evaluation for {lang} starts...")

    # Fresh copy of the pretrained checkpoint, with the decoder prompt forced to
    # this language's transcription task.
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to("cuda")
    model.config.forced_decoder_ids = processors[lang].get_decoder_prompt_ids(
        language=codes[lang],
        task="transcribe",
    )

    # Point the shared arguments at this language's output directory.
    training_args.output_dir = f"/home/sivan/whisper_base_fl_{lang}"

    # Metric function for this language, built from a language-specific tokenizer.
    tokenizer = WhisperTokenizer.from_pretrained(
        "openai/whisper-base",
        language=languages[lang],
        task="transcribe",
    )
    compute_metrics = utils.metrics(codes[lang], tokenizer)

    # Prediction-only trainer: no train or eval datasets are attached.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=collators[lang],
        compute_metrics=compute_metrics,
        tokenizer=processors[lang].feature_extractor,
    )

    # Metric keys carry the "test" prefix; they are logged and saved under "test2".
    predict_results = trainer.predict(preprocess[lang]['test'], metric_key_prefix="test")
    metrics = predict_results.metrics
    trainer.log_metrics("test2", metrics)
    trainer.save_metrics("test2", metrics)

Evaluation for ko_kr starts...


loading configuration file config.json from cache at /home/sivan/.cache/huggingface/hub/models--openai--whisper-base/snapshots/267437e6e7a1d6cb0846340910c19302aa1fd1fb/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-base",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": 448

[' 피라미드 사운드와 광선쇼는 이 지역에서 어린이들의 흥미를 가장 많이 끌어드리는 것들 중 하나입니다.', ' 더 전통적인 교회는 부활절주간에 토요일 밤에 부활 처리하기를 열며 여기에서 신두들이 종종 자정의 주의 부활을 축하합니다.', ' 산을 둘러싸고 있는 맑고 아름다운 하늘 바뀌는 보이지 않는다. 이 동구란에서는 바깥 세상이 보이지도 들리지도 않는다.', ' 장면들이 피라미드들 위에 비쳤고 다른 피라미드들에는 불이 밝혀졌다.', ' 뇌 병리와 행동 사이에 상관 관계가 과학자들의 영구를 독부습니다.'] ['피라미드 사운드와 광선 쇼는 이 지역에서 어린이들의 흥미를 가장 많이 끌어들이는 것들 중 하나입니다.', '더 전통적인 교회는 부활절 주간의 토요일 밤에 부활 철야제를 열며, 여기에서 신도들이 종종 자정에 주의 부활을 축하합니다.', '산을 둘러싸고 있는 맑고 아름다운 하늘밖에는 보이지 않는다. 이 동굴 안에서는 바깥세상이 보이지도, 들리지도 않는다.', '장면들이 피라미드들 위에 비쳤고 다른 피라미드들에는 불이 밝혀졌다.', '뇌 병리와 행동 사이의 상관관계가 과학자들의 연구를 돕습니다.']
***** test2 metrics *****
  test_loss               =     1.2711
  test_runtime            = 0:01:48.37
  test_samples_per_second =      3.525
  test_steps_per_second   =      0.111
  test_wer                =    32.8191
  test_wer_norm           =    29.1801
Evaluation for he_il starts...


loading configuration file config.json from cache at /home/sivan/.cache/huggingface/hub/models--openai--whisper-base/snapshots/267437e6e7a1d6cb0846340910c19302aa1fd1fb/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-base",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": 448

[' בדרך כלל תמיד שום עם את כלות התיירים ועמוקרים סיבור הכל והאור הוא ממש כמו סיבורים', ' זודר חשובה להבדיד במספר פעלים והצמין.', ' הגובה מנימה למטחת לגשר הוא 50% הבנייה ה-20% באוגוס 2.1, וואונפתח לתנוע רק במרצ 2.2.0.', ' אז מה רעדיו כנדלוואל לבטו, שפן בו גבאדשן כמו חן לקסיג הציג את.', ' אבל הרבה דברים את זה ציפורים מדליים כמו דינוזווי'] ['בדרך כלל תמיד שומעים את קולות התיירים והמוכרים. סיפור הקול והאור הוא ממש כמו ספר סיפורים.', 'זו דרך חשובה להבדיל בין מספר פעלים ועצמים.', 'הגובה המינימלי מתחת לגשר הוא 15 מטר. הבנייה הסתיימה באוגוסט 2011, והוא נפתח לתנועה רק במרץ 2017.', "כמו כן, לאקה סינג הציג את chhappan bhog bhajan. הזמר ראג'ו קנדלוואל ליווה אותו.", 'אבל הרבה דברים אצל ציפורים עדיין נראים כמו דינוזאורים.']
***** test2 metrics *****
  test_loss               =     1.7893
  test_runtime            = 0:05:59.23
  test_samples_per_second =      2.205
  test_steps_per_second   =       0.07
  test_wer                =    72.1853
  test_wer_norm           =    69.7215
Evaluation for te_in

loading configuration file config.json from cache at /home/sivan/.cache/huggingface/hub/models--openai--whisper-base/snapshots/267437e6e7a1d6cb0846340910c19302aa1fd1fb/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-base",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": 448

[' 10-12 Ṭhbhag.', ' hatharena warisankhe enthe ekku antesain pita sqeed lojerin antakriyelant churranan ke andri ki sadhe padalidhu.', ' Khevalam konnisvalpa beidaalutappa, kaalpanika brundaalukoda, sampradhaya brundaaluvanti naipunyam kaligumantai.', ' Aerosmith wari paryatana lo miglinal sangi te kacheri leno dhadhu chaisindi.', ' i Ṛṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭṭ�'] ['రెండవ సెట్\u200cలో Del Potroకు ఆధిక్యం లభించినా కూడా, ఈ సెట్\u200cలో కూడా 6-6కి చేరుకున్న తర్వాత టై బ్రేక్ అనివార్యం అయ్యింది.', "హాజరైన వారి సంఖ్య ఎంత ఎక్కువంటే, St. Peter's స్క్వేర్\u200cలో జరిగిన అంత్యక్రియలను చూడడానికి అందరికీ సాధ్యపడలేదు.", 'కేవలం కొన్ని స్వల్ప బేధాలు తప్ప, కాల్పనిక బృందాలు కూడా సాంప్రదాయ బృందాల వంటి నైపుణ్యం కలిగి ఉంటాయి.', 'ఏరోస్మిత్ వారి పర్యటనలో మిగిలిన సంగీత కచేరీలను రద్దు చేసింది.', 'ఈ ప్రమాదం ఎత్తుగా ఉన్న పర్వత భూభాగంలో జరిగిందని, ఇంకా శత్రువులు రాజేసిన అగ్ని ఫలితంగా జరిగిందని నమ్ముతారు.']
***** test2 metrics ***

loading configuration file config.json from cache at /home/sivan/.cache/huggingface/hub/models--openai--whisper-base/snapshots/267437e6e7a1d6cb0846340910c19302aa1fd1fb/config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-base",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": 448

['1940年8月15日蒙軍公路法國南部這次進攻被稱為龍旗冰晴洞', '該全島位於南極半島一杯120公里處最大的島嶼是橋至國王島這裡是凡青春的定居點', '卷心菜之的颜色将根据化学物质的酸检性发生变化', '海地正與民主研究所引用的獨立研究表明是尼波爾的聯合國維和部隊在不知情的情況下將這種疾病帶到了海地', '老虎是全能運動員,他會犯罰,雖然犯罰能力不是很強,游泳,跳遠,拉力能達到人類大力勢的五倍'] ['1940 年 8 月 15 日，盟军攻入法国南部，这次进攻被称为“龙骑兵行动”。', '该群岛位于南极半岛以北 120 公里处。最大的岛屿是乔治国王岛，这里是“繁星村（Villa Las Estrellas）”的定居点。', '卷心菜汁的颜色将根据化学物质的酸碱性发生变化。', '海地正义与民主研究所 (Haitian Institute for Justice and Democracy) 引用的独立研究表明，是尼泊尔的联合国维和部队在不知情的情况下将这种疾病带到了海地。', '老虎是全能运动员，它会攀爬（虽然攀爬能力不是很强）、游泳、跳远，拉力能达到人类大力士的五倍。']
***** test2 metrics *****
  test_loss               =     1.7503
  test_runtime            = 0:05:04.18
  test_samples_per_second =      3.107
  test_steps_per_second   =      0.099
  test_wer                =   103.5536
  test_wer_norm           =    35.1913
CPU times: user 17min 10s, sys: 3min 30s, total: 20min 41s
Wall time: 18min 36s


In [None]:
%%time
# Compare with the plain `Dataset.map` evaluation: for every language, score the
# pretrained Whisper base checkpoint and then the best finetuned checkpoint.
from jiwer import wer


def _score_checkpoint(checkpoint, lang, banner):
    """Transcribe the raw FLEURS test split of `lang` with `checkpoint` and print its WER.

    Args:
        checkpoint: model name or local path given to
            `WhisperForConditionalGeneration.from_pretrained`.
        lang: FLEURS config name; a key of `tests`, `processors` and `codes`.
        banner: heading printed before the two scores.

    Returns:
        `(model, predictions)`: the loaded model (moved to CUDA) and the mapped dataset.
    """
    whisper = WhisperForConditionalGeneration.from_pretrained(checkpoint).to("cuda")
    whisper.config.forced_decoder_ids = processors[lang].get_decoder_prompt_ids(
        language=codes[lang], task="transcribe"
    )
    predictions = tests[lang].map(
        utils.map_to_pred,
        fn_kwargs={"model": whisper, "processor": processors[lang], "lang": codes[lang]},
        batched=True,
        batch_size=1,
    )
    print(banner)
    # WER_NORM scores the `transcription`/`prediction` columns, WER the raw columns.
    # NOTE(review): the recorded te_in run reports WER_NORM above 2 — worth checking
    # how Telugu text is normalized inside utils.map_to_pred.
    print("WER_NORM:", wer(predictions["transcription"], predictions["prediction"]))
    print("WER:", wer(predictions["raw_transcription"], predictions["raw_prediction"]))
    return whisper, predictions


for lang in langs:
    print(f"Evaluation for {lang} starts...")

    # Pretrained baseline.
    model, result = _score_checkpoint(
        "openai/whisper-base",
        lang,
        "Results from pretrain Whisper base:",
    )

    # Best finetuned checkpoint for this language.
    model_tune, result_tune = _score_checkpoint(
        f"/home/sivan/whisper_base_fl_{lang}/checkpoint-{best[lang]}",
        lang,
        "Results from finetune Whisper base:",
    )

Evaluation for ko_kr starts...


  0%|          | 0/382 [00:00<?, ?ba/s]

Results from pretrain Whisper base:
WER_NORM: 0.3330376153300213
WER: 0.3567375886524823


  0%|          | 0/382 [00:00<?, ?ba/s]

Results from finetune Whisper base:
WER_NORM: 0.2940028388928318
WER: 0.3129432624113475
Evaluation for he_il starts...


  0%|          | 0/792 [00:00<?, ?ba/s]

Results from pretrain Whisper base:
WER_NORM: 0.7137796147317145
WER: 0.7522901891252955


  0%|          | 0/792 [00:00<?, ?ba/s]

Results from finetune Whisper base:
WER_NORM: 0.5929588899549783
WER: 0.619533096926714
Evaluation for te_in starts...


  0%|          | 0/472 [00:00<?, ?ba/s]

Results from pretrain Whisper base:
WER_NORM: 2.5908219353189312
WER: 2.474859622256253


  0%|          | 0/472 [00:00<?, ?ba/s]

Results from finetune Whisper base:
WER_NORM: 2.9284162086156207
WER: 0.6856814701378254
Evaluation for cmn_hans_cn starts...


  0%|          | 0/945 [00:00<?, ?ba/s]

In [8]:
# Finetune Whisper base on the Telugu (te_in) features and score the result on the
# test split, reusing the dataset, collator and processor loaded earlier.
fleurs_te = preprocess["te_in"]
collator = collators["te_in"]
processor = processors["te_in"]
# NOTE: this rebinds `training_args`, replacing the shared evaluation arguments
# defined in an earlier cell.
training_args = Seq2SeqTrainingArguments(
    output_dir="/home/sivan/whisper_base_fl_te_in",
    per_device_train_batch_size=16,  # originally 16
    gradient_accumulation_steps=1,  # originally 1, increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=False,  # originally True
    group_by_length=False,  # set true if length is specified in dataset
    evaluation_strategy="steps",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=False,  # set true to push trained model to HF
    disable_tqdm=False,  # keep False to see the progress bar
)

# load pretrained model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to("cuda")
# Training-time config: no forced decoder prompt, no suppressed tokens, no cache.
# NOTE(review): presumably the language/task prompt comes from the labels during
# training — confirm against the preprocessing code.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False

# specify metric for each language
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Telugu", task="transcribe")
compute_metrics = utils.metrics("te", tokenizer)


# set trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=fleurs_te["train"],
    eval_dataset=fleurs_te["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

print(f"Training starts...")
# Resumes from a checkpoint saved under `output_dir` (the recorded run loaded
# checkpoint-4000 and, afterwards, the best model from checkpoint-1000).
trainer.train(resume_from_checkpoint = True) # comment out to evaluate the original Whisper base instead

# Score the test split; metric keys carry the "test" prefix and are logged and
# saved under the name "test3".
predict_results = trainer.predict(fleurs_te["test"], metric_key_prefix="test")
metrics = predict_results.metrics
trainer.log_metrics("test3", metrics)
trainer.save_metrics("test3", metrics)

max_steps is given, it will override any value given in num_train_epochs
Loading model from /home/sivan/whisper_base_fl_te_in/checkpoint-4000.


Training starts...


The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: norm, labels_length, feature_length, input_length. If norm, labels_length, feature_length, input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2302
  Num Epochs = 28
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4000
  Number of trainable parameters = 72593920
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 27
  Continuing training from global step 4000
  Will skip the first 27 epochs then the first 112 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the traini

  0%|          | 0/112 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /home/sivan/whisper_base_fl_te_in/checkpoint-1000 (score: 71.45242070116862).
The following columns in the test set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: norm, labels_length, feature_length, input_length. If norm, labels_length, feature_length, input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 472
  Batch size = 32


['అనెండవ సెట్ లో డెల్ పోట్రాకు ఆధిక్యం లభించిన కూడా ఈ సెట్ లో కూడా ఆరు ఆర్నించి ఆరికీ చేరుకున్న తర్వాత డైబ్రే గణివారియం అ�', 'హాథరైనవారిసంకే ఎంత ఎక్కువంటే సెన్ పీటస్ క్వేడ్ లు జరీ నంత క్రీలన్ చూగడానికి అందరికి సాధ్యపడలేదు.', 'కేవలం కొన్ని స్వల్ప భేదాలు తప్ప కాల్పనిక బృందాలు కూడా సంప్రదాయ బృందాలు వంటి నైకున్యం కలిగి ఉంటాయి.', 'ఏరోస్మిద్వారి పర్యటనలో మిగిలిన సంగీత కచేరీలను రద్దుచేసింది.', 'ఈ ప్రమాదం ఎత్తుగా ఉన్న పర్వత భూభాగంలో జరిగిన్నని ఇంకా శెత్రులు రాజేసిన అగ్ని ఫలితంగా జరిగిన్నని నమ్ముతారు.'] ['రెండవ సెట్\u200cలో Del Potroకు ఆధిక్యం లభించినా కూడా, ఈ సెట్\u200cలో కూడా 6-6కి చేరుకున్న తర్వాత టై బ్రేక్ అనివార్యం అయ్యింది.', "హాజరైన వారి సంఖ్య ఎంత ఎక్కువంటే, St. Peter's స్క్వేర్\u200cలో జరిగిన అంత్యక్రియలను చూడడానికి అందరికీ సాధ్యపడలేదు.", 'కేవలం కొన్ని స్వల్ప బేధాలు తప్ప, కాల్పనిక బృందాలు కూడా సాంప్రదాయ బృందాల వంటి నైపుణ్యం కలిగి ఉంటాయి.', 'ఏరోస్మిత్ వారి పర్యటనలో మిగిలిన సంగీత కచేరీలను రద్దు చేసింది.', 'ఈ ప్రమాదం ఎత్తుగా ఉన్న పర్వత భూభాగంలో జరిగిందని, ఇంకా శత్రువులు రాజేసిన అగ్ని ఫలితం