# Whisper Pretrained Evaluation on LibriSpeech & Fleurs

Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.

Codes below adpated from [github repo](https://github.com/openai/whisper)

## Requirement
Python 3.9.9 and PyTorch 1.10.1

In [1]:
import os
print (os.environ['CONDA_DEFAULT_ENV'])

base


In [2]:
#!pip install git+https://github.com/openai/whisper.git 

## Multilingual model

### Testing on Telugu

In [14]:
import io
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import urllib
import tarfile
import whisper
import jiwer
import torchaudio

from scipy.io import wavfile
from tqdm.notebook import tqdm


pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 1000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cuda


In [4]:
# download Telugu from Fleurs
import ipywidgets as widgets

languages = {"af_za": "Afrikaans", "am_et": "Amharic", "ar_eg": "Arabic", "as_in": "Assamese", "az_az": "Azerbaijani", "be_by": "Belarusian", "bg_bg": "Bulgarian", "bn_in": "Bengali", "bs_ba": "Bosnian", "ca_es": "Catalan", "cmn_hans_cn": "Chinese", "cs_cz": "Czech", "cy_gb": "Welsh", "da_dk": "Danish", "de_de": "German", "el_gr": "Greek", "en_us": "English", "es_419": "Spanish", "et_ee": "Estonian", "fa_ir": "Persian", "fi_fi": "Finnish", "fil_ph": "Tagalog", "fr_fr": "French", "gl_es": "Galician", "gu_in": "Gujarati", "ha_ng": "Hausa", "he_il": "Hebrew", "hi_in": "Hindi", "hr_hr": "Croatian", "hu_hu": "Hungarian", "hy_am": "Armenian", "id_id": "Indonesian", "is_is": "Icelandic", "it_it": "Italian", "ja_jp": "Japanese", "jv_id": "Javanese", "ka_ge": "Georgian", "kk_kz": "Kazakh", "km_kh": "Khmer", "kn_in": "Kannada", "ko_kr": "Korean", "lb_lu": "Luxembourgish", "ln_cd": "Lingala", "lo_la": "Lao", "lt_lt": "Lithuanian", "lv_lv": "Latvian", "mi_nz": "Maori", "mk_mk": "Macedonian", "ml_in": "Malayalam", "mn_mn": "Mongolian", "mr_in": "Marathi", "ms_my": "Malay", "mt_mt": "Maltese", "my_mm": "Myanmar", "nb_no": "Norwegian", "ne_np": "Nepali", "nl_nl": "Dutch", "oc_fr": "Occitan", "pa_in": "Punjabi", "pl_pl": "Polish", "ps_af": "Pashto", "pt_br": "Portuguese", "ro_ro": "Romanian", "ru_ru": "Russian", "sd_in": "Sindhi", "sk_sk": "Slovak", "sl_si": "Slovenian", "sn_zw": "Shona", "so_so": "Somali", "sr_rs": "Serbian", "sv_se": "Swedish", "sw_ke": "Swahili", "ta_in": "Tamil", "te_in": "Telugu", "tg_tj": "Tajik", "th_th": "Thai", "tr_tr": "Turkish", "uk_ua": "Ukrainian", "ur_pk": "Urdu", "uz_uz": "Uzbek", "vi_vn": "Vietnamese", "yo_ng": "Yoruba"}
selection = widgets.Dropdown(
    options=[("Select language", None), ("----------", None)] + sorted([(f"{v} ({k})", k) for k, v in languages.items()]),
    value="te_in",
    description='Language:',
    disabled=False,
)

selection

Dropdown(description='Language:', index=75, options=(('Select language', None), ('----------', None), ('Afrika…

In [5]:
lang = selection.value
language = languages[lang]

assert lang is not None, "Please select a language"
print(f"Selected language: {language} ({lang})")

Selected language: Telugu (te_in)


In [6]:
class Fleurs(torch.utils.data.Dataset):
    """
    A simple class to wrap Fleurs and subsample a portion of the dataset as needed.
    """
    def __init__(self, lang, split="test", subsample_rate=1, device=DEVICE):
        url = f"https://storage.googleapis.com/xtreme_translations/FLEURS102/{lang}.tar.gz"
        tar_path = os.path.expanduser(f"~/.cache/fleurs/{lang}.tgz")
        os.makedirs(os.path.dirname(tar_path), exist_ok=True)

        if not os.path.exists(tar_path):
            with urllib.request.urlopen(url) as source, open(tar_path, "wb") as output:
                with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
                    while True:
                        buffer = source.read(8192)
                        if not buffer:
                            break

                        output.write(buffer)
                        loop.update(len(buffer))

        labels = {}
        all_audio = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            for member in tar.getmembers():
                name = member.name
                if name.endswith(f"{split}.tsv"):
                    labels = pd.read_table(tar.extractfile(member), names=("id", "file_name", "raw_transcription", "transcription", "_", "num_samples", "gender"))

                if f"/{split}/" in name and name.endswith(".wav"):
                    audio_bytes = tar.extractfile(member).read()
                    all_audio[os.path.basename(name)] = wavfile.read(io.BytesIO(audio_bytes))[1]
                    

        self.labels = labels.to_dict("records")[::subsample_rate]
        self.all_audio = all_audio
        self.device = device

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        record = self.labels[item]
        audio = torch.from_numpy(self.all_audio[record["file_name"]].copy())
        text = record["transcription"]
        
        return (audio, text)
    

In [7]:
# downloading ~2G for ~60 seconds
dataset = Fleurs(lang, subsample_rate=2)  # subsample 2% of the dataset for a quick demo

  0%|                                              | 0.00/1.81G [00:00<?, ?iB/s]

In [8]:
model = whisper.load_model("base")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is multilingual and has 71,825,920 parameters.


In [9]:
options = dict(language=language, beam_size=5, best_of=5, fp16 = False)
transcribe_options = dict(task="transcribe", **options)
#translate_options = dict(task="translate", **options)

In [11]:
%%time

references = []
transcriptions = []
#translations = []

for audio, text in tqdm(dataset):
    transcription = model.transcribe(audio, **transcribe_options)["text"]
    #translation = model.transcribe(audio, **translate_options)["text"]
    
    transcriptions.append(transcription)
    #translations.append(translation)
    references.append(text)

  0%|          | 0/236 [00:00<?, ?it/s]

CPU times: user 40min 32s, sys: 4min 26s, total: 44min 58s
Wall time: 44min 49s


In [16]:
data = pd.DataFrame(dict(reference=references, transcription=transcriptions, translation=translations))
data

Unnamed: 0,reference,transcription,translation
0,అదే విధంగా షెంగెన్ వీసా ఉంటే మీరు షెంగెన్లో ఉండే ప్రతి దేశానికి వేరుగా వీసాను అప్లై చేయనవసరం లేదు దీని వలన సమయం డబ్బు మరియు కాగితాలను ఆదా చేయవచ్చు,azhévidha Angaro shen ken? vis sa andti? shen ken? botet chayar çık axate dhi sian kiviresa vis sa yalla amen apai chaayaram le et spl chw dhinava écala samayem dubbu mite ke說k hake tara ba n 되고 jaywworm faço baag aa a t baag nib k pong jag,We have not applied any vise and pristine vise in the same way. We have not applied any vise and pristine vise. We have not applied any vise and pristine vise in the same way.
1,"ఏది ఏమయినప్పటికీ రాత్రికి రాత్రే పథకాన్ని అమలు చేసి సోవియట్ యూనియన్ రెడ్ ఆర్మీకి చెందిన 800,000 మంది సైనికులు రిగా శాంతి ఒప్పందం సోవియట్-పోలిష్ నాన్-అగ్రెషన్ ఒప్పందం మరియు ఇతర అంతర్జాతీయ ఒప్పందాలు ద్వైపాక్షిక మరియు బహుపాక్షిక ఒప్పందాలను ఉల్లంఘిస్తూ పోలాండ్ యొక్క తూర్పు ప్రాంతాలపై దాడి చేసిన తరువాత బెలరషియన్ మరియు ఉక్రేనియన్ ఫ్రంట్లు సృష్టించారు",اידی ایم این اپلگی راتریکャ estate,"Last night, two of seamen in a whitish survey was released in 2 months Theettes of pilots have been stating that seaports don't hold up as much contact as access to the Mediterranean and Europe that took him back. Shien Maru ukrainian frontal Supreme Charı"
2,ఉత్తరాన ఈ ప్రాంతం సాహెల్ దక్షిణాన మరియు పశ్చిమాన అట్లాంటిక్ మహాసముద్రం ద్వారా సరిహద్దుగా ఉంది,"異ர் தெடுத்துபம், ரா singers tellementுடனே ஒரு அதிகம் சோரா や ஓவில் நடத்துபெனைக் கwarzqu information page","A year after the heart may be blessed, the Metropolitan Customer of Professor, Dr. Chandramarichi the"
3,కొన్ని షిప్ లు బ్రోచర్ లలో బెర్లిన్ జర్మనీని చూపించాయి మీరు పైన ఉన్న మ్యాప్ ని చూసినట్లయితే బెర్లిన్ సముద్రం సమీపంలో ఎక్కడా లేదు మరియు నగర సందర్శన షిప్ ధరలో చేర్చబడలేదు,کن்னு சிப்பு போஞ்சர்லலோ பெடில் செல்வனின் சுப்பின்சாயி மீரு பயனான் உன்னமாகப் பெண்ணு சூசினைத் தலைத்து பெடில் சமுத்துரம் சமிப்பம்லும் எக்கடாலேது மரியும் நகர சந்தர் சிப்பு பெடில் செய்த்து படில் ஏது,"In Im新聞 running in some progress, Bedlean注意s are left. On the 2nd day of episode 1, your old man was watching, Edu was not here with a Bedelin Samudhra maim. We have to change some steps on the shape into a fairly interesting shaped picture by hand."
4,మేము మా ఇళ్ళు మొక్కలతోనే నిర్మిస్తాము మరియు మొక్కల నుండే బట్టలు తయారు చేస్తాము మేము తినే ఆహారం చాలా వరకు మొక్కలే మొక్కలు లేకుండా జంతువులు మనుగడ సాగించలేవు,مِمُ مَائِمُ مَائِلُ مَقْلَتَونِ نِرْمِسْتَمُ مَرِيُمُ مَقْلَنٌ دِبَتَلُ تَعَرْجَسْتَمُ مِِمُ تِنِ اَحَارَمْ شَالَى وَرْكُ مَقْلَهِ مَقْلَهِ مَقْلُلُ لَيْكُنْدَ جَنْتُولُ مَلُغَدَ سَاجِنْتَ لَيْوُ,"Then, you can prepare any flour or рис for cooking chat with them on fire."
...,...,...,...
231,అధికార పార్టీ సౌత్ వెస్ట్ ఆఫ్రికా పీపుల్స్ ఆర్గనైజేషన్ ఎస్డబల్యూఏపిఓ కూడా పార్లమెంటు ఎన్నికలలో మెజారిటీని నిలుపుకుంది,"Bhadikarika Party, South West, Africa, Peoples Organization, NWA, APO, KUDA, Padlaminti, Ennikalala, Mejaratini, Nilpukundi.","Bhadikarika Party, South West, Africa, Peoples Organization, NWA, APO, KUDA, Parliament, Indicolors, Mejaratini, Neelpukundi."
232,అంతేకాకుండా ప్రతి రాజవంశానికి చెందిన ప్రాంతాలు కొన్ని సంవత్సరాలుగా విభజించబడి ఉన్నాయి ఈ కాలాలలో బాగా తెలిసినది హాన్ మరియు జిన్ రాజవంశం మధ్య 60 సంవత్సరాల పాటు జరుగుతున్న మూడు రాజ్యాల శకము,"ان்டைக்காக் கொண்டா, பற்றி ராஜாவம்ஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞஞ","First of all, the people of Pratiraja Vamsha, who were born in Pranthal, are the people of Vibhajn and Chabadi. These are the people of Kalala, the people of Baga, Telisi, Han, Mariyu, Jinn, the people of Raja Vamsha. The people of Pratiraja Vamsha are the people of Pranthal, the people of Pranthal, Raja Vamsha."
233,రోలాండో మెన్డోజా తన m16 రైఫిల్ను పర్యాటకులపై పేల్చాడు,Prolando Mendoza Tana M16 rifle 16 rifle,inールando so the m16 rifle fell very hot
234,గోమా చుట్టూ భ్రమించడానికి బోడా-బోడాను మోటార్ సైకిల్ టాక్సీ ఉపయోగించవచ్చు. సాధారణ స్థానిక వెల ~500 కాంగో ఫ్రాంక్స్ షార్ట్ రైడ్ కొరకు,Goma Chuttu Bramichadani ki Boda Boda Anu Motorcycle Taxi upayu ginsha vachu Saadara Nastanika Villa aidovandala Kangu Frunks Short Riding Kwaraku,"Motorcycle Taxes swing freely due to Bhuramma tips for Bolt, Board No place for World War II, neither Congo, France short-triage and still racists."


In [18]:
# calculate WER

te_wer = jiwer.wer(list(data["reference"]), list(data["transcription"]))

print(f"WER: {te_wer * 100:.2f} %")

WER: 127.47 %


### Test on Hebrew

In [19]:
# download Hebrew from Fleurs
import ipywidgets as widgets

languages = {"af_za": "Afrikaans", "am_et": "Amharic", "ar_eg": "Arabic", "as_in": "Assamese", "az_az": "Azerbaijani", "be_by": "Belarusian", "bg_bg": "Bulgarian", "bn_in": "Bengali", "bs_ba": "Bosnian", "ca_es": "Catalan", "cmn_hans_cn": "Chinese", "cs_cz": "Czech", "cy_gb": "Welsh", "da_dk": "Danish", "de_de": "German", "el_gr": "Greek", "en_us": "English", "es_419": "Spanish", "et_ee": "Estonian", "fa_ir": "Persian", "fi_fi": "Finnish", "fil_ph": "Tagalog", "fr_fr": "French", "gl_es": "Galician", "gu_in": "Gujarati", "ha_ng": "Hausa", "he_il": "Hebrew", "hi_in": "Hindi", "hr_hr": "Croatian", "hu_hu": "Hungarian", "hy_am": "Armenian", "id_id": "Indonesian", "is_is": "Icelandic", "it_it": "Italian", "ja_jp": "Japanese", "jv_id": "Javanese", "ka_ge": "Georgian", "kk_kz": "Kazakh", "km_kh": "Khmer", "kn_in": "Kannada", "ko_kr": "Korean", "lb_lu": "Luxembourgish", "ln_cd": "Lingala", "lo_la": "Lao", "lt_lt": "Lithuanian", "lv_lv": "Latvian", "mi_nz": "Maori", "mk_mk": "Macedonian", "ml_in": "Malayalam", "mn_mn": "Mongolian", "mr_in": "Marathi", "ms_my": "Malay", "mt_mt": "Maltese", "my_mm": "Myanmar", "nb_no": "Norwegian", "ne_np": "Nepali", "nl_nl": "Dutch", "oc_fr": "Occitan", "pa_in": "Punjabi", "pl_pl": "Polish", "ps_af": "Pashto", "pt_br": "Portuguese", "ro_ro": "Romanian", "ru_ru": "Russian", "sd_in": "Sindhi", "sk_sk": "Slovak", "sl_si": "Slovenian", "sn_zw": "Shona", "so_so": "Somali", "sr_rs": "Serbian", "sv_se": "Swedish", "sw_ke": "Swahili", "ta_in": "Tamil", "te_in": "Telugu", "tg_tj": "Tajik", "th_th": "Thai", "tr_tr": "Turkish", "uk_ua": "Ukrainian", "ur_pk": "Urdu", "uz_uz": "Uzbek", "vi_vn": "Vietnamese", "yo_ng": "Yoruba"}
selection = widgets.Dropdown(
    options=[("Select language", None), ("----------", None)] + sorted([(f"{v} ({k})", k) for k, v in languages.items()]),
    value="he_il",
    description='Language:',
    disabled=False,
)

selection

Dropdown(description='Language:', index=28, options=(('Select language', None), ('----------', None), ('Afrika…

In [20]:
lang = selection.value
language = languages[lang]

assert lang is not None, "Please select a language"
print(f"Selected language: {language} ({lang})")

Selected language: Hebrew (he_il)


In [24]:
# downloading ~2G for ~60 seconds
dataset = Fleurs(lang, subsample_rate=2)  # subsample 2% of the dataset for a quick demo

  0%|                                              | 0.00/2.22G [00:00<?, ?iB/s]

In [21]:
options = dict(language=language, beam_size=5, best_of=5, fp16 = False)
transcribe_options = dict(task="transcribe", **options)
#translate_options = dict(task="translate", **options)`

In [25]:
%%time

references = []
transcriptions = []
#translations = []

for audio, text in tqdm(dataset):
    transcription = model.transcribe(audio, **transcribe_options)["text"]
    #translation = model.transcribe(audio, **translate_options)["text"]
    
    transcriptions.append(transcription)
    #translations.append(translation)
    references.append(text)

  0%|          | 0/396 [00:00<?, ?it/s]

CPU times: user 5min 36s, sys: 46.5 s, total: 6min 22s
Wall time: 6min 15s


In [27]:
data = pd.DataFrame(dict(reference=references, transcription=transcriptions))
data

Unnamed: 0,reference,transcription
0,מרבית הפרשנויות לדטרמיניזם טכנולוגי חולקות שני רעיונות כלליים שפיתוח הטכנולוגיה מתקדם בדרך שלא מושפעת תרבותית או פוליטית במידה מרובה וכי לטכנולוגיה יש השפעות על חברות שהן מובנות ולא מותנות חברתית,"מרבית הפרשונות לדר את המיזם טכנולוגי חולקות שני רעל כלכליים, כלליים, שבטוח את טכנולוגי מתקדם בדרך שלא מושפה תרבותית או פוליטנו מידה מעובה, בקין הטכנולוגי יש השפעות על חברות שיהיה נובנות ולא מותנות חברתית."
1,מאוחר ביום ראשון נשיא ארה ב דונלד טראמפ הודיע שהכוחות האמריקאים יעזבו את סוריה בהצהרה שהעבירה דוברת הבית הלבן,"מיוחר ביום ראשון, נסי ארב דולנטרם, אותי שעקוחות האמריקאים יעזבו את סוריה ביצרה שבעירה דופרת הבית הלבן."
2,המכון לצדק ולדמוקרטיה של האיטי אזכר מחקרים עצמאיים שמראים כי גדוד שמירת השלום הנפאלי של האו ם הביא ללא ידיעתו את המחלה להאיטי,המכון לצדק ולדמוקרטי של הייתי הזכר מחכרים עצמאים שברים כי גדוד שמיר את השולם על הפלי של האום ולהלויד יתועת המחלה ל הייתי
3,גיהוץ בגדים לחים יכול לעזור להם להתייבש במלונות רבים יש מגהץ וקרש גיהוץ זמינים להשאלה גם אם הם לא נמצאים בחדר,"גיוץ בגדים לכים יכול לעזור למליטבי לתייבש ממלון תרבי משפגי עצוק יש גוץ מיניים להשאלה, אדם ממלון למצרים בחידה"
4,המערה שוכנת בפסגת אחד ההרים מצפון למכה והיא מבודדת לחלוטין מכל שאר העולם,המערה שוכנת בפסגת אחד הארים מצפון למקה והיא מפודדת לחלוטין מכל שערה עולם.
...,...,...
391,רבים אינם רואים בהם דינוזאורים מאחר שיש להם נוצות והם יכולים לעוף,"רבים אינם רואים בהם דינוזרים, מי אחר שיש להם נוצד והם יכולים לעוף."
392,מיטשל גורלי האוסטרלי סיים במקום האחד עשר בסופר-ג'י לגברים המתחרה הצ'כי אולדריך ילינק סיים במקום השישה עשר במקצה הסופר-ג'י בישיבה לגברים,"מיצל גורלי האוסטרלי סיימב המקום האחד, עשר בסופר אדגי לגברים המתחרת שחי, עול דרך יליניק, סיימב המקום השישה עשר במקצי הסופר אדגי בישיבה לגברים."
393,מיטשל גורלי האוסטרלי סיים במקום האחד עשר בסופר-ג'י לגברים המתחרה הצ'כי אולדריך ילינק סיים במקום השישה עשר במקצה הסופר-ג'י בישיבה לגברים,מצל גורליה הוסטרליסים המקומה אחד השעה בסופר גיל הגברים מתחרעת שכי עוד ריח ילינק שיהם במקומה שישה שישה שער בני קטע סופר גיל בשיבה לגברים
394,היא קשורה אך בדרך כלל אינה מערבת סקי בסגנון אלפיני או טיפוס הרים האחרונים מבוצעים בשטחים תלולים והם דורשים מגפיים ומגלשיים קשיחים הרבה יותר,"כשורך בדרך כלל לנמרי וצקי בזיגנואלפיניות, טיפוסריים, החוני ובצימש טקרנטלויים וממדור שמגפה ממוגלשיים שלכים הרבה יותר"


In [28]:
# calculate WER

he_wer = jiwer.wer(list(data["reference"]), list(data["transcription"]))

print(f"WER: {he_wer * 100:.2f} %")

WER: 69.20 %
