# Testing Whisper's Robustness to Downsampled Librispeech

using Hugging Face version [Whisper model](https://huggingface.co/docs/transformers/model_doc/whisper)

In [1]:
from utils import *

In [2]:
set_start_method("spawn")
num_cpus = cpu_count()
print('{} available cpus'.format(num_cpus))

4 available cpus


## Getting 1000~16000Hz downsampled data

In [3]:
# Loading data from bucket https://console.cloud.google.com/storage/browser/capstone_datasets/librispeech/test/predictions;tab=objects?project=ecbm4040-an3078-326401&pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false
# downsampled data saved by wav2vec-downsample branch https://github.com/anhvung/Capstone-Audio-Transcription/blob/main/wav2vec/downsample.ipynb
# 9.4GB in total taking ~60s
!gsutil -m cp -n -r gs://capstone_datasets/librispeech/test/predictions/* ./predictions/

Skipping existing item: file://./predictions/lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./predictions/lr_clean_test_ds_1000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./predictions/lr_clean_test_ds_1000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./predictions/lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./predictions/lr_clean_test_ds_16000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./predictions/lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset.arrow
Skipping existing item: file://./predictions/lr_clean_test_ds_2000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./predictions/lr_clean_test_ds_16000Hz_w2v2_base_960h/state.json
Skipping existing item: file://./predictions/lr_clean_test_ds_4000Hz_w2v2_base_960h/dataset_info.json
Skipping existing item: file://./predictions/lr_clean_test_ds_2000Hz_w2v2_base_960h/state.json
Skipping e

In [4]:
!ls

'Whisper Exploration-noisysub.ipynb'   datasets
'Whisper Exploration.ipynb'	       predictions
 __pycache__			       robustness_to_downsampling.ipynb
 audio.mp3			       utils.py


In [5]:
# 6 types of sample rates
sr = [500, 1000, 2000, 4000, 8000, 16000]
datasets = {}

# load datasets and remove results
for i in sr:
    datasets[i] = load_from_disk(os.path.join(predictions_path, 'lr_clean_test_ds_{}Hz_w2v2_base_960h'.format(i)))
    datasets[i].remove_columns(['logits', 'transcription', 'label'])

In [6]:
# load base model and review
model = whisper.load_model("base.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is English-only and has 71,825,408 parameters.


In [7]:
print(device)

cuda


In [8]:
# load base model and processor
processor, model = load_whisper("openai/whisper-base.en")

In [9]:
datasets

{500: Dataset({
     features: ['audio', 'label', 'ground_truth', 'logits', 'transcription'],
     num_rows: 2620
 }),
 1000: Dataset({
     features: ['audio', 'label', 'ground_truth', 'logits', 'transcription'],
     num_rows: 2620
 }),
 2000: Dataset({
     features: ['audio', 'label', 'ground_truth', 'logits', 'transcription'],
     num_rows: 2620
 }),
 4000: Dataset({
     features: ['audio', 'label', 'ground_truth', 'logits', 'transcription'],
     num_rows: 2620
 }),
 8000: Dataset({
     features: ['audio', 'label', 'ground_truth', 'logits', 'transcription'],
     num_rows: 2620
 }),
 16000: Dataset({
     features: ['audio', 'label', 'ground_truth', 'logits', 'transcription'],
     num_rows: 2620
 })}

In [10]:
batch = datasets[500][0]

In [11]:
batch

{'audio': {'path': None,
  'array': array([0.00039673, 0.00039673, 0.00042725, ..., 0.        , 0.        ,
         0.        ]),
  'sampling_rate': 500},
 'label': 19,
 'ground_truth': 'HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOUR FATTENED SAUCE',
 'logits': [[[9.564332008361816,
    -23.872249603271484,
    -23.72770118713379,
    -23.67909049987793,
    0.7167125940322876,
    -1.7751439809799194,
    -0.1529458910226822,
    -1.4273263216018677,
    -2.8150970935821533,
    -0.8111014366149902,
    0.8942804336547852,
    -1.465958833694458,
    -2.876735210418701,
    -3.1003828048706055,
    -1.4441367387771606,
    -2.778733968734741,
    -2.589205265045166,
    -4.185495376586914,
    -1.4342527389526367,
    -4.302725791931152,
    -1.237879753112793,
    -2.964927911758423,
    -2.8165652751922607,
    -1.7877482175827026,
    -0.6364930868148804,
    -4.273426532745361,
    -3

In [12]:
sampling_rate = batch["audio"]['sampling_rate']

In [13]:
input_features = processor(batch["audio"]["array"], sampling_rate=16000, return_tensors="pt").input_features

In [14]:
# compute prediction for all datasets

datasets[500].map(map_to_pred, fn_kwargs={"model": model, "processor": processor})

  0%|          | 0/2620 [00:00<?, ?ex/s]



In [9]:
# compute prediction for all datasets

results = {}
for i in sr:
    results[i] = datasets[i].map(map_to_pred,
                                 fn_kwargs={"model": model, "processor": processor},
                                 batched=True, batch_size=16)
    results.save_to_disk(os.path.join(predictions_path, 'lr_clean_test_ds_' + str(i) + 'Hz_whisper_base.en'))

#!gsutil -m cp -n -r ./predictions/ gs://capstone_datasets/librispeech/test/whisper_downsample/

  0%|          | 0/164 [00:00<?, ?ba/s]

TypeError: list indices must be integers or slices, not str