# Testing Whisper's Robustness to Downsampled Librispeech

This notebook uses the Hugging Face implementation of the [Whisper model](https://huggingface.co/docs/transformers/model_doc/whisper).

In [1]:
# `utils` is the project-local helper module; later cells call
# utils.load_from_disk, utils.load_whisper, utils.map_to_pred and utils.format_wer.
import utils
import transformers
import os
# Lower the transformers log level to errors only so that warnings/info
# messages do not flood the cell outputs.
transformers.logging.set_verbosity_error()

In [2]:
from multiprocessing import set_start_method, cpu_count

# Use the "spawn" start method for worker processes.
# force=True keeps this cell re-runnable: without it, a second execution in the
# same kernel raises RuntimeError("context has already been set").
set_start_method("spawn", force=True)

# Number of CPUs reported by the OS (referenced by the disabled num_proc
# option of the prediction cell).
num_cpus = cpu_count()
print('{} available cpus'.format(num_cpus))

4 available cpus


In [3]:
# set paths for input/output
root = '/home/sivan'
project = '/home/sivan/asr'
datasets_path = os.path.join(root, 'datasets')
predictions_path = os.path.join(root, 'predictions')

# Create the folders if they do not already exist.
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking and creating, and re-running the cell is a no-op.
# (If one of the paths exists but is a regular file, this now raises
# FileExistsError instead of silently continuing.)
for folder in (datasets_path, predictions_path):
    os.makedirs(folder, exist_ok=True)

## Getting 500~16000Hz downsampled data

In [4]:
# Download the downsampled LibriSpeech test datasets from the project bucket.
# Bucket (Cloud console view): https://console.cloud.google.com/storage/browser/capstone_datasets/librispeech/test/predictions;tab=objects?project=ecbm4040-an3078-326401&pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false
# The downsampled data was saved by the wav2vec-downsample branch: https://github.com/anhvung/Capstone-Audio-Transcription/blob/wav2vec-downsample/wav2vec/downsample.ipynb
# Author's measurement: 9.4GB in total, taking ~60s.
# gsutil copies into the current directory ("."), so switch to datasets_path first.
os.chdir(datasets_path)
# -m: parallel transfer, -n: no-clobber (items that already exist locally are skipped), -r: recursive.
!gsutil -m cp -n -r gs://capstone_datasets/librispeech/test/predictions/* .
# Go back to the project directory for the rest of the notebook.
os.chdir(project)

Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_4000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_4000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./

In [16]:
# 6 types of sample rates (Hz)
sr = [500, 1000, 2000, 4000, 8000, 16000]
datasets = {}

# Load each downsampled dataset from disk and drop the stored result columns
# ('logits', 'transcription', 'label') so that only the audio and the
# reference text remain.
# os.path.join is called directly instead of through `utils.os`, which only
# works as long as the helper module happens to expose its own `os` import.
for rate in sr:
    dataset_dir = os.path.join(datasets_path, 'lr_clean_test_ds_{}Hz_w2v2_base_960h'.format(rate))
    datasets[rate] = utils.load_from_disk(dataset_dir).remove_columns(['logits', 'transcription', 'label'])

In [17]:
# Show the 500 Hz dataset summary, then one example row (index 10), one per line.
print(datasets[500], datasets[500][10], sep='\n')

Dataset({
    features: ['audio', 'ground_truth'],
    num_rows: 2620
})
{'audio': {'path': None, 'array': array([-1.52587891e-04, -1.83105469e-04, -2.13623047e-04, ...,
        3.05175781e-05,  0.00000000e+00,  0.00000000e+00]), 'sampling_rate': 500}, 'ground_truth': 'WELL NOW ENNIS I DECLARE YOU HAVE A HEAD AND SO HAS MY STICK'}


## Testing on Whisper-base.en

In [7]:
!pip install git+https://github.com/openai/whisper.git

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-6eoiq963
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-6eoiq963
  Resolved https://github.com/openai/whisper.git to commit 9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d
  Preparing metadata (setup.py) ... [?25ldone


In [8]:
# import
import torch
import pandas as pd
import whisper
import numpy as np
import torchaudio
from tqdm.notebook import tqdm

In [9]:
# Load the openai-whisper "base.en" checkpoint and print a one-line summary
# (language mode and total parameter count).
# Note: `model` is rebound by the later utils.load_whisper cell.
model = whisper.load_model("base.en")
language_mode = 'multilingual' if model.is_multilingual else 'English-only'
parameter_count = sum(weight.numel() for weight in model.parameters())
print(f"Model is {language_mode} and has {parameter_count:,} parameters.")

Model is English-only and has 71,825,408 parameters.


In [10]:
# Pick the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(DEVICE)

cuda


In [11]:
# Load the Hugging Face "openai/whisper-base.en" checkpoint through the
# project helper; its return value is unpacked as (processor, model).
# Note: this rebinds `model`, which an earlier cell bound to the object
# returned by whisper.load_model("base.en").
processor, model = utils.load_whisper("openai/whisper-base.en")

In [12]:
%%time
# compute prediction for all datasets
results = {}
for i in sr:
    print("Start eval on", i, "Hz")
    # eval on batches
    results[i] = datasets[i].map(utils.map_to_pred,
                                 fn_kwargs={"model": model, "processor": processor},
                                 # num_proc=num_cpus,
                                 writer_batch_size=1000)
    # save results to output folder
    print("Saving dataset...")
    results[i].save_to_disk(utils.os.path.join(predictions_path, 'lr_clean_test_ds_' + str(i) + 'Hz_whisper_base.en'))

Start eval on 500 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

2022-10-22 08:36:30.579037: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-22 08:36:30.801291: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-22 08:36:31.630669: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-10-22 08:36:31.630810: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'li

Saving dataset...
Start eval on 1000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
Start eval on 2000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
Start eval on 4000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
Start eval on 8000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
Start eval on 16000 Hz


  0%|          | 0/2620 [00:00<?, ?ex/s]

Saving dataset...
CPU times: user 2h 17min 32s, sys: 5min 24s, total: 2h 22min 56s
Wall time: 1h 39min 39s


In [62]:
# Upload the prediction datasets to the bucket (author's measurement: ~20s for 13.5G).
# The source is given relative to `root` ("./predictions/"), so change into root first.
os.chdir(root)
# -m: parallel transfer, -n: no-clobber, -r: recursive.
# NOTE(review): because of -n, objects that already exist in the bucket are not
# replaced, so re-running this after the local predictions were rewritten will
# not refresh the uploaded copies.
!gsutil -m cp -n -r ./predictions/ gs://capstone_datasets/librispeech/test/whisper_downsample/
# Go back to the project directory.
os.chdir(project)

Copying file://./predictions/lr_clean_test_ds_4000Hz_whisper_base.en/cache-bbe35ccd0cabbbe8.arrow [Content-Type=application/octet-stream]...
Copying file://./predictions/lr_clean_test_ds_16000Hz_whisper_base.en/state.json [Content-Type=application/json]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://./predictions/lr_clean_test_

In [55]:
# Text normalizer applied to the predicted transcriptions before computing WER.
# Temporary (original author's note): remove this once the stored datasets have been modified.
from whisper.normalizers import EnglishTextNormalizer
# Single shared normalizer instance.
normalizer = EnglishTextNormalizer()

def map_trans(batch, normalize=None):
    """Normalize the predicted transcription of one example.

    The original version took ``batch['transcription'][0]`` and then threw
    that value away, passing the un-indexed field to the normalizer. That
    only works when the field is already a plain string. This version
    accepts both forms.

    Args:
        batch: Mapping with a ``'transcription'`` entry that is either a
            string or a list/tuple whose first element is the predicted
            text (the original ``[0]`` indexing suggests predictions may be
            stored that way; confirm against ``utils.map_to_pred``).
        normalize: Optional callable ``str -> str``. Defaults to the
            notebook-level ``normalizer``; it can be supplied explicitly,
            e.g. through ``fn_kwargs`` of ``Dataset.map``.

    Returns:
        The same ``batch`` object, with ``'transcription'`` replaced by the
        normalized string.
    """
    if normalize is None:
        # Looked up at call time so the notebook-level instance is the default.
        normalize = normalizer

    transcription = batch['transcription']
    if isinstance(transcription, (list, tuple)):
        # Unwrap the single prediction; an empty container becomes an empty string.
        transcription = transcription[0] if transcription else ''

    batch['transcription'] = normalize(transcription)
    return batch


In [59]:
# Load the prediction datasets, normalize the transcriptions and report WER.

# 6 types of sample rates (Hz)
sr = [500, 1000, 2000, 4000, 8000, 16000]
predictions = {}

# WER (reference, hypothesis_clean)
for rate in sr:
    # One path is used for both loading and saving; os.path.join is called
    # directly rather than through `utils.os`.
    prediction_dir = os.path.join(predictions_path, 'lr_clean_test_ds_{}Hz_whisper_base.en'.format(rate))
    predictions[rate] = utils.load_from_disk(prediction_dir).map(map_trans)
    # NOTE(review): this writes the normalized dataset back into the directory
    # it was just loaded from, so the raw predictions are overwritten. Confirm
    # that this is intended and that the installed `datasets` release permits
    # saving a dataset over its own source directory.
    predictions[rate].save_to_disk(prediction_dir)
    wer = utils.format_wer(predictions[rate]["ground_truth"], predictions[rate]["transcription"])
    print('WER: Whisper_base.en, ls-test-clean-{}Hz:'.format(rate), wer, '%.')

  0%|          | 0/2620 [00:00<?, ?ex/s]

WER: Whisper_base.en, ls-test-clean-500Hz: 99.4 %.


  0%|          | 0/2620 [00:00<?, ?ex/s]

WER: Whisper_base.en, ls-test-clean-1000Hz: 305.2 %.


  0%|          | 0/2620 [00:00<?, ?ex/s]

WER: Whisper_base.en, ls-test-clean-2000Hz: 60.9 %.


  0%|          | 0/2620 [00:00<?, ?ex/s]

WER: Whisper_base.en, ls-test-clean-4000Hz: 13.2 %.


  0%|          | 0/2620 [00:00<?, ?ex/s]

WER: Whisper_base.en, ls-test-clean-8000Hz: 4.8 %.


  0%|          | 0/2620 [00:00<?, ?ex/s]

WER: Whisper_base.en, ls-test-clean-16000Hz: 4.3 %.
