# Whisper Model
modified notebook by Sivan Ding (sd5397)

Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.

The code below is adapted from the [openai/whisper GitHub repository](https://github.com/openai/whisper).

## Requirement
Python 3.9.9 and PyTorch 1.10.1

In [1]:
!pip install colorednoise ffmpeg torchaudio jiwer

Collecting torchaudio
  Downloading torchaudio-0.12.1-cp37-cp37m-manylinux1_x86_64.whl (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting torch==1.12.1
  Downloading torch-1.12.1-cp37-cp37m-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torch, torchaudio
  Attempting uninstall: torch
    Found existing installation: torch 1.10.0
    Uninstalling torch-1.10.0:
      Successfully uninstalled torch-1.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.12.1 which is incompatible.
fastai 2.5.3 requires torch<1.11,>=1.7.0, but you have torch 1

In [2]:
# Julia's code

import librosa
import numpy as np
import colorednoise as cn


def add_noise(s, sample_rate=16000, noise_percentage_factor = .01, noise_type='white'):
    """Add scaled colored noise to a mono signal and return it at 16 kHz.

    Parameters
    ----------
    s : numpy.ndarray
        Audio input (mono).
    sample_rate : int
        Sample rate of ``s`` in Hz.
    noise_percentage_factor : float
        Factor the generated noise is multiplied by before it is added to ``s``.
    noise_type : str
        One of ``'white'``, ``'pink'`` or ``'brown'``.

    Returns
    -------
    numpy.ndarray
        ``s`` plus the scaled noise, resampled to 16 kHz when ``sample_rate``
        is not already 16000.

    Raises
    ------
    ValueError
        If ``noise_type`` is not one of the supported names.
    """
    # Exponent passed to colorednoise for each supported noise colour.
    noise_betas = {'white': 0, 'pink': 1, 'brown': 2}

    # Fail early with a clear message instead of hitting an unbound `beta` below.
    if noise_type not in noise_betas:
        raise ValueError(
            f"Unknown noise_type {noise_type!r}; expected one of {sorted(noise_betas)}"
        )
    beta = noise_betas[noise_type]

    # One noise sample per element of s.
    noise = cn.powerlaw_psd_gaussian(beta, s.size)

    noisy_s = s + noise * noise_percentage_factor

    # output should be at 16kHz sample rate
    if sample_rate != 16000:
        noisy_s = librosa.resample(noisy_s, orig_sr = sample_rate, target_sr=16000)

    return noisy_s


def add_signals(s, back_s, sample_rate=16000, back_sample_rate=16000, noise_db=-12):
    """Mix a background signal into a mono signal after changing its level.

    Parameters
    ----------
    s : numpy.ndarray
        Audio input (mono).
    back_s : numpy.ndarray
        Background audio.
    sample_rate : int
        Sample rate of ``s`` in Hz.
    back_sample_rate : int
        Sample rate of ``back_s`` in Hz.
    noise_db : float
        Gain in dB applied to the background before mixing; negative values
        lower it (default -12).

    Returns
    -------
    numpy.ndarray
        ``s`` plus the level-adjusted background, at 16 kHz.
    """
    # make sure both signals have same 16kHz sample rate
    if sample_rate != 16000:
        s = librosa.resample(s, orig_sr=sample_rate, target_sr=16000)

    if back_sample_rate != 16000:
        back_s = librosa.resample(back_s, orig_sr=back_sample_rate, target_sr=16000)

    # Centre-pad whichever signal is shorter so both have the same length.
    if s.size > back_s.size:
        back_s = librosa.util.pad_center(back_s, size=s.size)

    elif s.size < back_s.size:
        s = librosa.util.pad_center(s, size=back_s.size)

    # A level change in dB is a multiplicative gain on the amplitude, so the
    # background is scaled (not offset) by the converted factor.
    noise_amp = librosa.db_to_amplitude(noise_db)
    lower_back_s = back_s * noise_amp

    # add the level-adjusted background to the sound clip
    noisy_s = s + lower_back_s

    # output should be at 16kHz sample rate
    return noisy_s


def down_sample(s, sample_rate=16000, output_sr=8000):
    """Resample a mono signal to ``output_sr`` and then back to 16 kHz.

    Parameters
    ----------
    s : numpy.ndarray
        Audio input (mono).
    sample_rate : int
        Sample rate of ``s`` in Hz.
    output_sr : int
        Intermediate sample rate in Hz.

    Returns
    -------
    numpy.ndarray
        The signal after the round trip, at 16 kHz.
    """
    # First pass: go to the intermediate rate.
    reduced = librosa.resample(s, orig_sr=sample_rate, target_sr=output_sr)

    # Second pass: come back to 16 kHz so the output rate is fixed.
    return librosa.resample(reduced, orig_sr=output_sr, target_sr=16000)

In [3]:
import os
# Show which conda environment the kernel runs in. Use .get() so the cell
# prints a placeholder instead of raising KeyError when the variable is unset
# (for example when the notebook is not run from a conda environment).
print(os.environ.get('CONDA_DEFAULT_ENV', '<no conda environment>'))

base


In [4]:
!pip install git+https://github.com/openai/whisper.git 

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-o9wbjgup
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-o9wbjgup
  Resolved https://github.com/openai/whisper.git to commit d18e9ea5dd2ca57c697e8e55f9e654f06ede25d0
  Preparing metadata (setup.py) ... [?25ldone


# Base model testing

## English-only model

In [5]:
import whisper

# Load the "base" checkpoint.
# NOTE(review): `model` is reassigned to the "base.en" checkpoint in the
# LibriSpeech evaluation cells further down, and the only code that would use
# this instance is the commented-out demo below - confirm this load is needed.
model = whisper.load_model("base")

# --- Single-file transcription demo (disabled; expects a local "audio.mp3") ---

# load audio and pad/trim it to fit 30 seconds
#audio = whisper.load_audio("audio.mp3")
#audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
#mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
#_, probs = model.detect_language(mel)
#print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
#options = whisper.DecodingOptions(fp16 = False)
#result = whisper.decode(model, mel, options)

# print the recognized text
#print(result.text)

### evaluation on LibriSpeech

In [7]:
# load dataset
# installing takes ~30 seconds
import os
import numpy as np

try:
    import tensorflow as tf  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


# Preprocess on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class LibriSpeech(torch.utils.data.Dataset):
    """
    LibriSpeech wrapper that trims/pads each utterance to 30 seconds and
    returns log-Mel spectrograms for three versions of it:

    * the clean audio,
    * the audio with noise added by ``add_noise`` (default: white noise),
    * the audio passed through ``down_sample`` (default parameters).

    Trimming drops the last few seconds of a very small portion of the
    utterances.
    """
    def __init__(self, split="test-clean", device=DEVICE):
        self.device = device
        # Downloads the requested split into ~/.cache when it is not there yet.
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=os.path.expanduser("~/.cache"),
            url=split,
            download=True,
        )

    def __len__(self):
        return len(self.dataset)

    def _mel(self, waveform):
        """Flatten, pad/trim, move to ``self.device`` and return the log-Mel spectrogram."""
        fitted = whisper.pad_or_trim(waveform.flatten()).to(self.device)
        return whisper.log_mel_spectrogram(fitted)

    def __getitem__(self, item):
        """Return ``(mel_clean, mel_noisy, mel_downsampled, text)`` for one utterance."""
        audio, sample_rate, text, _speaker, _chapter, _utterance = self.dataset[item]
        assert sample_rate == 16000

        # Clean reference.
        mel_c = self._mel(audio)

        # Noisy variant (add_noise defaults, i.e. white noise).
        noisy = add_noise(audio.numpy(), sample_rate=sample_rate)
        mel_n = self._mel(torch.from_numpy(noisy.astype('float32')))

        # Down-sampled variant (down_sample defaults).
        resampled = down_sample(audio.numpy().flatten(), sample_rate=sample_rate)
        mel_d = self._mel(torch.from_numpy(resampled.astype('float32')))

        return (mel_c, mel_n, mel_d, text)
    
# Build the test-clean split and serve it in batches of 16.
dataset = LibriSpeech(split="test-clean")
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=16,
)

  0%|          | 0.00/331M [00:00<?, ?B/s]

In [8]:
# Load the "base.en" checkpoint and report its kind and parameter count.
model = whisper.load_model("base.en")

model_kind = 'multilingual' if model.is_multilingual else 'English-only'
n_params = sum(np.prod(p.shape) for p in model.parameters())
print(
    f"Model is {model_kind} "
    f"and has {n_params:,} parameters."
)

100%|███████████████████████████████████████| 139M/139M [00:07<00:00, 20.6MiB/s]


Model is English-only and has 71,825,408 parameters.


In [9]:
# Decoding options for short-form transcription: English, no timestamps, fp16 disabled.
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=True,
    fp16=False,
)

In [10]:
# base model + 2619 samples will take 12 hours....
hypotheses_c, hypotheses_n, hypotheses_d = [], [], []
references = []

for mels_c, mels_n, mels_d, texts in tqdm(loader):
    # Decode the clean, noisy and down-sampled spectrograms of the same batch,
    # in that order, appending the transcripts to the matching list.
    for mels, hypotheses in (
        (mels_c, hypotheses_c),
        (mels_n, hypotheses_n),
        (mels_d, hypotheses_d),
    ):
        hypotheses.extend(result.text for result in model.decode(mels, options))

    references.extend(texts)

  0%|          | 0/164 [00:00<?, ?it/s]

In [11]:
# Pair each set of hypotheses with the shared reference transcripts.
data_clean = pd.DataFrame({"hypothesis": hypotheses_c, "reference": references})
data_noise = pd.DataFrame({"hypothesis": hypotheses_n, "reference": references})
data_downs = pd.DataFrame({"hypothesis": hypotheses_d, "reference": references})

In [12]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

def clean_text(normalizer, pred_dict):
    """Add normalized copies of the hypothesis and reference columns.

    ``normalizer`` is called on every entry of ``pred_dict["hypothesis"]`` and
    ``pred_dict["reference"]``; the results are stored under
    ``"hypothesis_clean"`` and ``"reference_clean"``. ``pred_dict`` is modified
    in place and also returned.
    """
    column_pairs = (
        ("hypothesis", "hypothesis_clean"),
        ("reference", "reference_clean"),
    )
    for raw_col, clean_col in column_pairs:
        pred_dict[clean_col] = [normalizer(entry) for entry in pred_dict[raw_col]]
    return pred_dict

# Apply the same English normalizer to all three result tables.
normalizer = EnglishTextNormalizer()
data_clean, data_noise, data_downs = (
    clean_text(normalizer, frame)
    for frame in (data_clean, data_noise, data_downs)
)

data_clean

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuffered into you his belly counseled him,stuff it into you his belly counseled him
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nelly is waiting on you good n...,number 10 fresh nelly is waiting on you good n...
...,...,...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...,0 to shoot my soul is full meaning into future...,0 to shoot my soul is full meaning into future...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...,then i long tried by natural ills received the...,then i long tried by natural ills received the...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...,i love thee freely as men strive for right i l...,i love thee freely as men strive for right i l...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...,i love thee with the passion put to use in my ...,i love thee with the passion put to use in my ...


In [13]:
def _corpus_wer(frame):
    """Word error rate of the normalized hypotheses against the normalized references."""
    return jiwer.wer(list(frame["reference_clean"]), list(frame["hypothesis_clean"]))

wer_c = _corpus_wer(data_clean)
wer_n = _corpus_wer(data_noise)
wer_d = _corpus_wer(data_downs)

print(f"Clean WER: {wer_c * 100:.2f} %")
print(f"White noise WER: {wer_n * 100:.2f} %")
print(f"Downsampled WER: {wer_d * 100:.2f} %")

Clean WER: 4.27 %
White noise WER: 6.22 %
Downsampled WER: 4.79 %
