# Testing Whisper's Robustness to Noisy Librispeech

Using the Hugging Face implementation of the [Whisper model](https://huggingface.co/docs/transformers/model_doc/whisper).

In [1]:
import utils
import transformers
import os
# Only show error-level messages from transformers (hides info/warning logs).
transformers.logging.set_verbosity_error()

In [2]:
from multiprocessing import set_start_method, cpu_count

# Use the "spawn" start method for worker processes.
# force=True makes this cell safe to re-run in the same kernel: without it a
# second call raises "RuntimeError: context has already been set".
set_start_method("spawn", force=True)

# Number of CPUs reported by the OS.
num_cpus = cpu_count()
print('{} available cpus'.format(num_cpus))

4 available cpus


In [3]:
# set paths for input/output
root = '/home/sivan'
project = '/home/sivan/asr'
datasets_path = os.path.join(root, 'datasets')
predictions_path = os.path.join(root, 'predictions')
# Create the folders if they do not already exist. exist_ok=True replaces the
# separate exists() check, so there is no window between checking and creating
# and the cell can be re-run freely.
os.makedirs(datasets_path, exist_ok=True)
os.makedirs(predictions_path, exist_ok=True)

## Getting the 1–6% noisy data

In [4]:
# Download the pre-computed datasets from the project bucket:
# https://console.cloud.google.com/storage/browser/capstone_datasets/librispeech/test/predictions;tab=objects?project=ecbm4040-an3078-326401&pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false
# The copies with different rates of noise were saved by the wav2vec-downsample branch:
# https://github.com/anhvung/Capstone-Audio-Transcription/blob/wav2vec-downsample/wav2vec/downsample.ipynb
# 9.4GB in total, taking ~60s.
# gsutil copies into the current directory ("."), so switch to datasets_path first.
# Flags: -m parallel transfer, -n skip files that already exist locally, -r recurse.
os.chdir(datasets_path)
!gsutil -m cp -n -r gs://capstone_datasets/librispeech/test/predictions/* .
# Return to the project directory once the download is done.
os.chdir(project)

Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_4000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_4000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./

In [5]:
# 6 rates of noise (percent); each has its own saved dataset folder
noise = [1,2,3,4,5,6]
datasets = {}

# Load each noisy dataset and drop the result columns stored with it, so only
# the inputs needed for a fresh evaluation remain.
# NOTE(review): 'logits', 'transcription' and 'label' are presumably left over
# from the earlier wav2vec2 run that saved these folders — confirm.
for i in noise:
    # Use the notebook's own `os` import rather than reaching through `utils.os`.
    datasets[i] = utils.load_from_disk(os.path.join(datasets_path, 'lr_clean_test_ns_{}%_w2v2_base_960h'.format(i)))
    datasets[i] = datasets[i].remove_columns(['logits', 'transcription', 'label'])

In [6]:
# Show the summary of the 1% dataset, then the example stored at index 10.
print(datasets[1], datasets[1][10], sep='\n')

Dataset({
    features: ['audio', 'ground_truth'],
    num_rows: 2620
})
{'audio': {'path': None, 'array': array([ 0.0045166 ,  0.00357056, -0.00195312, ..., -0.00308228,
        0.00469971, -0.00299072]), 'sampling_rate': 16000}, 'ground_truth': 'WELL NOW ENNIS I DECLARE YOU HAVE A HEAD AND SO HAS MY STICK'}


## Testing on Whisper-base.en

In [7]:
!pip install git+https://github.com/openai/whisper.git

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-6eoiq963
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-6eoiq963
  Resolved https://github.com/openai/whisper.git to commit 9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d
  Preparing metadata (setup.py) ... [?25ldone


In [7]:
# import
import torch
import pandas as pd
import whisper
import numpy as np
import torchaudio
from tqdm.notebook import tqdm

In [9]:
# Load the openai-whisper "base.en" checkpoint and print a one-line summary:
# whether it is multilingual and how many parameters it has.
model = whisper.load_model("base.en")
language_scope = 'multilingual' if model.is_multilingual else 'English-only'
# Each parameter contributes the product of its shape dimensions.
parameter_total = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {language_scope} and has {parameter_total:,} parameters.")

Model is English-only and has 71,825,408 parameters.


In [8]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(DEVICE)

cuda


In [10]:
# Load the base.en processor and model through the project helper.
# This rebinds `model`, replacing whatever object was previously bound to it.
# NOTE(review): presumably returns the Hugging Face processor/model pair for the
# "openai/whisper-base.en" checkpoint — confirm against utils.load_whisper.
processor, model = utils.load_whisper("openai/whisper-base.en")

In [13]:
%%time
# compute prediction for all datasets
results = {}
for i in noise:
    save_path = os.path.join(predictions_path, 'lr_clean_test_ns_' + str(i) + '%_whisper_base.en')
    if os.path.exists(save_path):
        print("Skipping saved prediction...")
        continue
    print("Start eval on", i, "% noisy Librispeech")
    # eval on batches
    results[i] = datasets[i].map(utils.map_to_pred,
                                 fn_kwargs={"model": model, "processor": processor},
                                 # num_proc=num_cpus,
                                 writer_batch_size=1000)
    # save results to output folder
    print("Saving dataset...")
    results[i].save_to_disk(save_path)

Skipping saved prediction...
Skipping saved prediction...
Start eval on 3 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
Start eval on 4 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
Start eval on 5 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
Start eval on 6 % noisy Librispeech


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
CPU times: user 1h 23min 38s, sys: 3min 38s, total: 1h 27min 16s
Wall time: 57min 22s


In [14]:
# Upload the predictions folder to the bucket (~60s for 13.5G).
# The source path ./predictions/ is relative, so gsutil has to run from `root`.
# Flags: -m parallel transfer, -n leave objects that already exist untouched, -r recurse.
os.chdir(root)
!gsutil -m cp -n -r ./predictions/ gs://capstone_datasets/librispeech/test/whisper_noise/
# Return to the project directory once the upload is done.
os.chdir(project)

Copying file://./predictions/lr_clean_test_ds_16000Hz_whisper_base.en/state.json [Content-Type=application/json]...
Copying file://./predictions/lr_clean_test_ns_6%_whisper_base.en/state.json [Content-Type=application/json]...
Copying file://./predictions/lr_clean_test_ns_6%_whisper_base.en/dataset.arrow [Content-Type=application/octet-stream]...
Copying file://./predictions/lr_clean_test_ds_16000Hz_whisper_base.en/cache-84fe5b479d8af6b6.arrow [Content-Type=application/octet-stream]...
Copying file://./predictions/lr_clean_test_ns_6%_whisper_base.en/dataset_info.json [Content-Type=application/json]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/stor

In [18]:
# Load the saved prediction datasets and report the WER for each noise level.

# 6 rates of noise (percent)
noise = [1,2,3,4,5,6]
predictions = {}

# utils.format_wer receives the ground-truth column first and the stored
# transcription column second.
for i in noise:
    # Use the notebook's own `os` import rather than reaching through `utils.os`.
    predictions[i] = utils.load_from_disk(os.path.join(predictions_path, 'lr_clean_test_ns_{}%_whisper_base.en'.format(i)))
    print('WER: Whisper_base.en, ls-test-clean-{}%:'.format(i), utils.format_wer(predictions[i]["ground_truth"], predictions[i]["transcription"]), '%.')

WER: Whisper_base.en, ls-test-clean-1%: 6.0 %.
WER: Whisper_base.en, ls-test-clean-2%: 11.0 %.
WER: Whisper_base.en, ls-test-clean-3%: 15.6 %.
WER: Whisper_base.en, ls-test-clean-4%: 24.1 %.
WER: Whisper_base.en, ls-test-clean-5%: 33.9 %.
WER: Whisper_base.en, ls-test-clean-6%: 40.1 %.
