# Testing Whisper's Robustness to Downsampled Librispeech

using Hugging Face version [Whisper model](https://huggingface.co/docs/transformers/model_doc/whisper)

In [10]:
import utils
import os

In [4]:
from multiprocessing import set_start_method, cpu_count
set_start_method("spawn")
num_cpus = cpu_count()
print('{} available cpus'.format(num_cpus))

10 available cpus


In [None]:
# set paths for input/output
datasets_path = os.path.join('/home/sivan/datasets')
predictions_path = os.path.join('/home/sivan/predictions')
# create folders if they do not already exist
if not os.path.exists(datasets_path): os.makedirs(datasets_path)
if not os.path.exists(predictions_path): os.makedirs(predictions_path)

## Getting 1000~16000Hz downsampled data

In [3]:
# Loading data from bucket https://console.cloud.google.com/storage/browser/capstone_datasets/librispeech/test/predictions;tab=objects?project=ecbm4040-an3078-326401&pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false
# downsampled data saved by wav2vec-downsample branch https://github.com/anhvung/Capstone-Audio-Transcription/blob/wav2vec-downsample/wav2vec/downsample.ipynb
# 9.4GB in total taking ~60s
!gsutil -m cp -n -r gs://capstone_datasets/librispeech/test/predictions/* .

Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_1000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_16000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_4000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./lr_clean_test_ds_4000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./lr_clean_test_ds_2000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./

In [4]:
!ls

'Whisper Exploration-noisysub.ipynb'	   lr_clean_test_ns_1%_w2v2_base_960h
'Whisper Exploration.ipynb'		   lr_clean_test_ns_2%_w2v2_base_960h
 __pycache__				   lr_clean_test_ns_3%_w2v2_base_960h
 audio.mp3				   lr_clean_test_ns_4%_w2v2_base_960h
 datasets				   lr_clean_test_ns_5%_w2v2_base_960h
 lr_clean_test_ds_1000Hz_w2v2_base_960h    lr_clean_test_ns_6%_w2v2_base_960h
 lr_clean_test_ds_16000Hz_w2v2_base_960h   lr_clean_test_w2v2_base_960h
 lr_clean_test_ds_2000Hz_w2v2_base_960h    predictions
 lr_clean_test_ds_4000Hz_w2v2_base_960h    requirements.txt
 lr_clean_test_ds_500Hz_w2v2_base_960h	  'robustness on downsampling.ipynb'
 lr_clean_test_ds_8000Hz_w2v2_base_960h    utils.py
 lr_clean_test_ns_0%_w2v2_base_960h


In [None]:
# 6 types of sample rates
sr = [500, 1000, 2000, 4000, 8000, 16000]
datasets = {}

# load datasets and remove results
for i in sr:
    datasets[i] = utils.load_from_disk(utils.os.path.join(datasets_path, 'lr_clean_test_ds_{}Hz_w2v2_base_960h'.format(i)))
    datasets[i].remove_columns(['logits', 'transcription', 'label'])

In [16]:
datasets[500]

Dataset({
    features: ['audio', 'label', 'ground_truth', 'logits', 'transcription'],
    num_rows: 2620
})

## Testing on Whisper-base.enb

In [2]:
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-0c6zn4_b
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-0c6zn4_b
  Resolved https://github.com/openai/whisper.git to commit 9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d
  Preparing metadata (setup.py) ... [?25ldone
Collecting more-itertools
  Downloading more_itertools-9.0.0-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m829.2/829.2 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: whis

In [5]:
# import
import torch
import pandas as pd
import whisper
import numpy as np
import torchaudio
from tqdm.notebook import tqdm

In [6]:
# load base model and review
model = whisper.load_model("base.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is English-only and has 71,825,408 parameters.


In [7]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cpu


In [18]:
# load base model and processor
processor, model = utils.load_whisper("openai/whisper-base.en")

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/844 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]

In [8]:
# compute prediction for all datasets

results = {}
for i in sr:
    results[i] = datasets[i].map(utils.map_to_pred,
                                 num_proc=num_cpus,
                                 fn_kwargs={"model": model, "processor": processor},
                                 batched=True, batch_size=16)
    results.save_to_disk(utils.os.path.join(predictions_path, 'lr_clean_test_ds_' + str(i) + 'Hz_whisper_base.en'))

!gsutil -m cp -n -r ./predictions/ gs://capstone_datasets/librispeech/test/whisper_downsample/

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/844 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/290M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# WER (reference, hypothesis_clean)