# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [7]:
# Run the local setup script through the shell.
# NOTE(review): prepare.sh is not part of this notebook; the text above says it installs the needed packages — confirm against the script.
!bash prepare.sh

Writing to /home/zzf/.config/pip/pip.conf
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
[31mERROR: Could not find a version that satisfies the requirement torchvision==0.8.2+cu110[0m
[31mERROR: No matching distribution found for torchvision==0.8.2+cu110[0m
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
prepare.sh: line 8: mkdirs: command not found
fatal: destination path 'whisper-base.en' already exists and is not an empty directory.
rm: cannot remove 'flax_model.msgpack': No such file or directory
--2024-04-19 04:27:25--  https://hf-mirror.com/openai/whisper-base.en/resolve/main/flax_model.msgpack?download=true
Resolving hf-mirror.com (hf-mirror.com)... 160.16.199.204, 133.242.169.68, 153.121.57.40
Connecting to hf-mirror.com (hf-mirror.com)|160.16.199.204|:443... connected.
HT

# Loading the LibriSpeech dataset

The following loads the test-clean split of the LibriSpeech corpus with torchaudio, reading it from a local copy of the corpus (nothing is downloaded).

In [8]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


# Run on the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [40]:
class LibriSpeech(torch.utils.data.Dataset):
    """
    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
    It will drop the last few seconds of a very small portion of the utterances.

    Args:
        split: Name of the LibriSpeech subset to read (e.g. ``"test-clean"``).
        device: Device the waveform is moved to before the log-Mel
            spectrogram is computed.
        root: Corpus directory handed to torchaudio. Defaults to
            ``DEFAULT_ROOT``. The corpus is never downloaded.
    """

    # Local copy of the corpus that is used when no ``root`` is supplied.
    DEFAULT_ROOT = "/home/zzf/playground/corpus/LibriSpeech"

    def __init__(self, split="test-clean", device=DEVICE, root=DEFAULT_ROOT):
        # The same directory is passed as both ``root`` and
        # ``folder_in_archive``, exactly as before. Making it absolute keeps
        # that pairing valid for relative ``root`` values too, because joining
        # any path with an absolute path yields the absolute path.
        corpus_dir = os.path.abspath(root)
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=corpus_dir,
            url=split,
            folder_in_archive=corpus_dir,
            download=False,
        )
        self.device = device

    def __len__(self):
        """Return the number of utterances in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, item):
        """Return ``(mel, text)`` for the utterance at index ``item``.

        ``mel`` is the log-Mel spectrogram of the padded/trimmed audio and
        ``text`` is the reference transcript stored with the utterance.
        """
        audio, sample_rate, text, _, _, _ = self.dataset[item]
        assert sample_rate == 16000, f"expected 16000 Hz audio, got {sample_rate} Hz"
        # Flatten to a 1-D waveform, pad/trim it, then move it to the target
        # device so the spectrogram is computed there.
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio)

        return (mel, text)

In [41]:
# Build the evaluation split and a loader that yields 16 utterances per batch.
dataset = LibriSpeech("test-clean")
loader = torch.utils.data.DataLoader(dataset, batch_size=16)
# Show the number of utterances; use the len() built-in rather than calling
# the __len__ special method directly.
len(dataset)

2620

# Running inference on the dataset using the medium.en Whisper model

The following will take a few minutes to transcribe all utterances in the dataset.

In [46]:
# Load the checkpoint, then report its language coverage and total size.
model = whisper.load_model("medium.en")

language_scope = "multilingual" if model.is_multilingual else "English-only"
# Each parameter tensor contributes the product of its dimensions.
parameter_count = sum(np.prod(weights.shape) for weights in model.parameters())
print(f"Model is {language_scope} and has {parameter_count:,} parameters.")

Model is English-only and has 762,320,896 parameters.


In [47]:
# Short-form transcription: decode English text and emit no timestamps.
decoding_settings = dict(language="en", without_timestamps=True)
options = whisper.DecodingOptions(**decoding_settings)

In [48]:
# Model transcripts and ground-truth transcripts, kept in the same order.
hypotheses, references = [], []

for mel_batch, text_batch in tqdm(loader):
    # Decode one batch of spectrograms in full precision.
    decoded = model.decode(mel_batch, options, fp16=False)
    for item in decoded:
        hypotheses.append(item.text)
    references.extend(text_batch)

  0%|          | 0/164 [00:00<?, ?it/s]

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s]


In [49]:
# One row per utterance: the model output next to the reference transcript.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...
1,"Stuff it into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND
4,"Number ten, Fresh Nellie is waiting on you. Go...",NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...
...,...,...
2615,"O, to shoot my soul's full meaning into future...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...
2616,"Then I, long tried by natural ills, Receiv'd t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...
2617,"I love thee freely, as men strive for right, I...",I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...
2618,"I love thee with the passion put to use, And m...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...


# Calculating the word error rate

Now, we use Whisper's English text normalizer to standardize both the transcriptions and the references, and then calculate the WER.

In [50]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# Instantiate the English text normalizer imported above; the following cell
# applies it to every hypothesis and reference string.
normalizer = EnglishTextNormalizer()

In [51]:
# Normalize both text columns so that scoring compares like with like.
hypothesis_clean = list(map(normalizer, data["hypothesis"]))
reference_clean = list(map(normalizer, data["reference"]))

data["hypothesis_clean"] = hypothesis_clean
data["reference_clean"] = reference_clean
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuff it into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuff it into you his belly counseled him,stuff it into you his belly counseled him
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,"Number ten, Fresh Nellie is waiting on you. Go...",NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nellie is waiting on you good ...,number 10 fresh nelly is waiting on you good n...
...,...,...,...,...
2615,"O, to shoot my soul's full meaning into future...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...,0 to shoot my soul is full meaning into future...,0 to shoot my soul is full meaning into future...
2616,"Then I, long tried by natural ills, Receiv'd t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...,then i long tried by natural ills receiv would...,then i long tried by natural ills received the...
2617,"I love thee freely, as men strive for right, I...",I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...,i love thee freely as men strive for right i l...,i love thee freely as men strive for right i l...
2618,"I love thee with the passion put to use, And m...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...,i love thee with the passion put to use and my...,i love thee with the passion put to use in my ...


In [52]:
# Word error rate over the normalized text: references first, hypotheses second.
reference_texts = list(data["reference_clean"])
hypothesis_texts = list(data["hypothesis_clean"])
wer = jiwer.wer(reference_texts, hypothesis_texts)

# Report the rate as a percentage with two decimals.
print(f"WER: {wer * 100:.2f} %")

WER: 3.02 %
