# Whisper Model
modified notebook by Sivan Ding (sd5397)

Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.

Code below adapted from [github repo](https://github.com/openai/whisper)

## Requirement
Python 3.9.9 and PyTorch 1.10.1

In [1]:
!pip install colorednoise ffmpeg torchaudio jiwer



In [2]:
!pip install git+https://github.com/openai/whisper.git 

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-m6zw9b__
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-m6zw9b__
  Resolved https://github.com/openai/whisper.git to commit 9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d
  Preparing metadata (setup.py) ... [?25ldone


In [3]:
# Julia's code

import librosa
import numpy as np
import colorednoise as cn


def add_noise(s, sample_rate=16000, noise_percentage_factor = .01, noise_type='white'):
    # s: audio input (mono)
    # sample rate: sample rate of s
    # noise_percentage_factor, percentage scale of added noise added
    # type: white, pink, brown

    if noise_type == 'white':
        beta = 0

    elif noise_type == 'pink':
        beta = 1

    elif noise_type == 'brown':
        beta = 2

    noise = cn.powerlaw_psd_gaussian(beta, s.size)

    noisy_s = s + noise * noise_percentage_factor

    # output should be at 16kHz sample rate
    if sample_rate != 16000:
        noisy_s = librosa.resample(noisy_s, orig_sr = sample_rate, target_sr=16000)

    return noisy_s


def add_signals(s, back_s, sample_rate=16000, back_sample_rate=16000, noise_db=-12):
    # s: audio input (mono)
    # back_s: brckgrnd audio
    # sample rate: sample rate of s
    # noise_db: lower the backgrnd signal by noise_db db


    # make sure both signals have same 16kHz sample rate
    if sample_rate != 16000:
        s = librosa.resample(s, orig_sr=sample_rate, target_sr=16000)

    if back_sample_rate != 16000:
        back_s = librosa.resample(back_s, orig_sr=back_sample_rate, target_sr=16000)

    if s.size > back_s.size:
        back_s = librosa.util.pad_center(back_s, size=s.size)

    elif s.size < back_s.size:
        s = librosa.util.pad_center(s, size=back_s.size)

    # lower background signal by noise_db
    noise_amp = librosa.db_to_amplitude(noise_db)
    lower_back_s = back_s - noise_amp

    # add background noise to sound clip
    noisy_s = s + back_s

    # output should be at 16kHz sample rate
    return noisy_s


def down_sample(s, sample_rate=16000, output_sr=8000):
    # s: audio input (mono)
    # sample rate: sample rate of s
    # output_sr: output sample rate

    # resample to output_sr
    resampled_s = librosa.resample(s, orig_sr=sample_rate, target_sr=output_sr)

    # then re-resample to 16000
    noisy_s = librosa.resample(resampled_s, orig_sr=output_sr, target_sr=16000)

    # output should be at 16kHz sample rate
    return noisy_s

In [4]:
import os
print(os.environ['CONDA_DEFAULT_ENV'])

base


# Base model testing

### evaluation on LibriSpeech

In [5]:
# load dataset
# installing takes ~30 seconds

try:
    import tensorflow as tf  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class LibriSpeech(torch.utils.data.Dataset):
    """
    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
    It will drop the last few seconds of a very small portion of the utterances.
    """
    def __init__(self, split="test-clean", device=DEVICE):
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=os.path.expanduser("~/.cache"),
            url=split,
            download=True,
        )
        self.device = device

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        audio, sample_rate, text, _, _, _ = self.dataset[item]
        assert sample_rate == 16000
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio)
        
        return (mel, text)

class LibriSpeechNoisy(torch.utils.data.Dataset):
    """
    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
    It will drop the last few seconds of a very small portion of the utterances.
    MODIFIED: for added noise 
    """
    def __init__(self, split="test-clean", device=DEVICE):
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=os.path.expanduser("~/.cache"),
            url=split,
            download=True,
        )
        self.device = device

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        audio, sample_rate, text, _, _, _ = self.dataset[item]
        assert sample_rate == 16000

        audio_clean = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel_c = whisper.log_mel_spectrogram(audio_clean)

        audio_noise = add_noise(audio.numpy(), sample_rate=sample_rate) # default white noise
        audio_noise = torch.from_numpy(audio_noise.astype('float32'))
        audio_noise = whisper.pad_or_trim(audio_noise.flatten()).to(self.device)
        mel_n = whisper.log_mel_spectrogram(audio_noise)

        audio_ds = down_sample(audio.numpy().flatten(), sample_rate=sample_rate) # default params
        audio_ds = torch.from_numpy(audio_ds.astype('float32'))
        audio_ds = whisper.pad_or_trim(audio_ds.flatten()).to(self.device)
        mel_d = whisper.log_mel_spectrogram(audio_ds)
        
        return (mel_c, mel_n, mel_d, text) 

In [6]:
dataset = LibriSpeechNoisy("test-clean")
loader = torch.utils.data.DataLoader(dataset, batch_size=16)

In [10]:
%%time
# load base model and review
model = whisper.load_model("base.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is English-only and has 71,825,408 parameters.
CPU times: user 1.81 s, sys: 176 ms, total: 1.99 s
Wall time: 1.66 s


In [8]:
# predict without timestamps for short-form transcription
options = whisper.DecodingOptions(language="en", without_timestamps=True, fp16 = False)

In [11]:
%%time

hypotheses_c = []
hypotheses_n = []
hypotheses_d = []
references = []

for i, (mels_c, mels_n, mels_d, texts) in enumerate(tqdm(loader)):
    results_c = model.decode(mels_c, options)
    hypotheses_c.extend([result.text for result in results_c])
    results_n = model.decode(mels_n, options)
    hypotheses_n.extend([result.text for result in results_n])
    results_d = model.decode(mels_d, options)
    hypotheses_d.extend([result.text for result in results_d])
    
    references.extend(texts)

  0%|          | 0/164 [00:00<?, ?it/s]

CPU times: user 40min 48s, sys: 9min 46s, total: 50min 35s
Wall time: 41min 27s


In [12]:
# inspect on testing results
data_clean = pd.DataFrame(dict(hypothesis=hypotheses_c, reference=references))
data_noise = pd.DataFrame(dict(hypothesis=hypotheses_n, reference=references))
data_downs = pd.DataFrame(dict(hypothesis=hypotheses_d, reference=references))

In [13]:
%%time

import jiwer
from whisper.normalizers import EnglishTextNormalizer

def clean_text(normalizer, pred_dict):
  pred_dict["hypothesis_clean"] = [normalizer(text) for text in pred_dict["hypothesis"]]
  pred_dict["reference_clean"] = [normalizer(text) for text in pred_dict["reference"]]
  return(pred_dict)

normalizer = EnglishTextNormalizer()
data_clean = clean_text(normalizer, data_clean)
data_noise = clean_text(normalizer, data_noise)
data_downs = clean_text(normalizer, data_downs)

data_clean

CPU times: user 4.26 s, sys: 35.1 ms, total: 4.29 s
Wall time: 4.36 s


Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuffered into you his belly counseled him,stuff it into you his belly counseled him
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nelly is waiting on you good n...,number 10 fresh nelly is waiting on you good n...
...,...,...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...,0 to shoot my soul is full meaning into future...,0 to shoot my soul is full meaning into future...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...,then i long tried by natural ills received the...,then i long tried by natural ills received the...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...,i love thee freely as men strive for right i l...,i love thee freely as men strive for right i l...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...,i love thee with the passion put to use in my ...,i love thee with the passion put to use in my ...


In [15]:
dataset = LibriSpeech("test-other")
loader = torch.utils.data.DataLoader(dataset, batch_size=16)

  0%|          | 0.00/314M [00:00<?, ?B/s]

In [16]:
%%time

hypotheses = []
references = []

for i, (mels, texts) in enumerate(tqdm(loader)):
    results = model.decode(mels, options)
    hypotheses.extend([result.text for result in results])
    references.extend(texts)

  0%|          | 0/184 [00:00<?, ?it/s]

CPU times: user 9min 38s, sys: 3min 25s, total: 13min 4s
Wall time: 11min 41s


In [17]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data

Unnamed: 0,hypothesis,reference
0,"There's iron, they say, in all our blood, and ...",THERE'S IRON THEY SAY IN ALL OUR BLOOD AND A G...
1,Margaret said Mr. Hale as he returned from sho...,MARGARET SAID MISTER HALE AS HE RETURNED FROM ...
2,You don't mean that you thought me so silly.,YOU DON'T MEAN THAT YOU THOUGHT ME SO SILLY
3,"I really like that account of himself, better ...",I REALLY LIKED THAT ACCOUNT OF HIMSELF BETTER ...
4,His statement of having been a shop boy was th...,HIS STATEMENT OF HAVING BEEN A SHOP BOY WAS TH...
...,...,...
2934,"Poor Isaac was hurried off accordingly, and ex...",POOR ISAAC WAS HURRIED OFF ACCORDINGLY AND EXP...
2935,The assurance that she possessed some friend i...,THE ASSURANCE THAT SHE POSSESSED SOME FRIEND I...
2936,She gazed accordingly upon a scene which might...,SHE GAZED ACCORDINGLY UPON A SCENE WHICH MIGHT...
2937,At his feet was placed at the table occupied b...,AT HIS FEET WAS PLACED A TABLE OCCUPIED BY TWO...


In [18]:
%%time

data = clean_text(normalizer, data)
data

CPU times: user 1.74 s, sys: 0 ns, total: 1.74 s
Wall time: 1.74 s


Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"There's iron, they say, in all our blood, and ...",THERE'S IRON THEY SAY IN ALL OUR BLOOD AND A G...,there is iron they say in all our blood and a ...,there is iron they say in all our blood and a ...
1,Margaret said Mr. Hale as he returned from sho...,MARGARET SAID MISTER HALE AS HE RETURNED FROM ...,margaret said mister hale as he returned from ...,margaret said mister hale as he returned from ...
2,You don't mean that you thought me so silly.,YOU DON'T MEAN THAT YOU THOUGHT ME SO SILLY,you do not mean that you thought me so silly,you do not mean that you thought me so silly
3,"I really like that account of himself, better ...",I REALLY LIKED THAT ACCOUNT OF HIMSELF BETTER ...,i really like that account of himself better t...,i really liked that account of himself better ...
4,His statement of having been a shop boy was th...,HIS STATEMENT OF HAVING BEEN A SHOP BOY WAS TH...,his statement of having been a shop boy was th...,his statement of having been a shop boy was th...
...,...,...,...,...
2934,"Poor Isaac was hurried off accordingly, and ex...",POOR ISAAC WAS HURRIED OFF ACCORDINGLY AND EXP...,poor isaac was hurried off accordingly and exp...,poor isaac was hurried off accordingly and exp...
2935,The assurance that she possessed some friend i...,THE ASSURANCE THAT SHE POSSESSED SOME FRIEND I...,the assurance that she possessed some friend i...,the assurance that she possessed some friend i...
2936,She gazed accordingly upon a scene which might...,SHE GAZED ACCORDINGLY UPON A SCENE WHICH MIGHT...,she gazed accordingly upon a scene which might...,she gazed accordingly upon a scene which might...
2937,At his feet was placed at the table occupied b...,AT HIS FEET WAS PLACED A TABLE OCCUPIED BY TWO...,at his feet was placed at the table occupied b...,at his feet was placed a table occupied by 2 s...


In [19]:
wer_c = jiwer.wer(list(data_clean["reference_clean"]), list(data_clean["hypothesis_clean"]))
wer_n = jiwer.wer(list(data_noise["reference_clean"]), list(data_noise["hypothesis_clean"]))
wer_d = jiwer.wer(list(data_downs["reference_clean"]), list(data_downs["hypothesis_clean"]))
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"Clean WER: {wer_c * 100:.2f} %")
print(f"White noise WER: {wer_n * 100:.2f} %")
print(f"Downsampled WER: {wer_d * 100:.2f} %")
print(f"Other WER: {wer * 100:.2f} %")

Clean WER: 4.27 %
White noise WER: 6.06 %
Downsampled WER: 4.79 %
Other WER: 10.36 %


In [2]:
# compare initial results with Wav2Vec2
# results from https://github.com/anhvung/Capstone-Audio-Transcription/blob/wav2vec/wav2vec/wav2vec_noisy.ipynb
import matplotlib.pyplot as plt
import pandas as pd


# pd.set_option("display.precision", 5)
comparison = pd.DataFrame({'Test Data': ['ls-test-clean', 'ls-test-other', 'ls-test-clean, noisy', 'ls-test-clean, downsampled'],
                           'Wav2Vec2-base-960h':[3.4, 9.3, 8.3, 4.2],
                          'Whisper-base.en': [4.3, 10.4, 6.1, 4.8],
                           'degradation ratio to clean set(Wav2Vec2)': [0.00, 1.73, 1.44, 0.24],
                           'degradation ratio to clean set(Whisper)': [0.00, 1.42, 0.46, 0.12],
                           })
# comparison.style.set_caption("WER% for Wav2Vec and Whisper under different conditions")
print("WER% for Wav2Vec and Whisper on LibriSpeech test under different conditions")
display(comparison)

WER% for Wav2Vec and Whisper on LibriSpeech test under different conditions


Unnamed: 0,Test Data,Wav2Vec2-base-960h,Whisper-base.en,degradation ratio to clean set(Wav2Vec2),degradation ratio to clean set(Whisper)
0,ls-test-clean,3.4,4.3,0.0,0.0
1,ls-test-other,9.3,10.4,1.73,1.42
2,"ls-test-clean, noisy",8.3,6.1,1.44,0.46
3,"ls-test-clean, downsampled",4.2,4.8,0.24,0.12
