Wav2Vec 2.0 WER on LibriSpeech clean-test with added noise and LibriSpeech other-test

In [None]:
pip install datasets jiwer transformers colorednoise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset
from jiwer import wer
import librosa
import nltk
import os
import tarfile
import torch
import urllib.request
import soundfile as sf
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC, Wav2Vec2Tokenizer, Wav2Vec2Processor, AutoModelForCTC, AutoProcessor
nltk.download('punkt')

import audio_preprocess

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Directory (inside the current working directory) where the LibriSpeech
# archives are downloaded and unpacked.
datasets_path = os.path.join(os.getcwd(), 'datasets')
# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(datasets_path, exist_ok=True)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def download_and_extract_dataset_from_url(url: str, datasets_path: str = datasets_path):
    """
    Download a tar archive from ``url`` and unpack it into ``datasets_path``.

    The archive is stored temporarily inside ``datasets_path`` under the last
    path component of ``url`` and is deleted again afterwards, whether or not
    the download and extraction succeed.

    Args:
        url: location of the tar archive; the compression is detected by
            ``tarfile.open``.
        datasets_path: directory that receives the extracted files. It is
            created if it does not exist yet.
    """
    os.makedirs(datasets_path, exist_ok=True)
    archive_path = os.path.join(datasets_path, url.split('/')[-1])
    try:
        urllib.request.urlretrieve(url, archive_path)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(archive_path) as archive:
            # The archive comes from the network, so use the 'data' extraction
            # filter where this Python version offers it: it refuses members
            # that would be written outside datasets_path.
            if hasattr(tarfile, 'data_filter'):
                archive.extractall(datasets_path, filter='data')
            else:
                archive.extractall(datasets_path)
    finally:
        # Never leave a (possibly partial) archive behind.
        if os.path.exists(archive_path):
            os.remove(archive_path)

In [None]:
# Fetch the two LibriSpeech evaluation splits from OpenSLR. A split whose folder
# already exists under datasets_path/LibriSpeech is skipped, so re-running the
# notebook does not download it again. Delete that folder to force a fresh copy
# (for example after an interrupted extraction).
for split_name in ('test-clean', 'test-other'):
    if not os.path.isdir(os.path.join(datasets_path, 'LibriSpeech', split_name)):
        download_and_extract_dataset_from_url('https://www.openslr.org/resources/12/' + split_name + '.tar.gz')

In [None]:
# load extracted lr data as dataset
librispeech_clean = load_dataset("datasets/LibriSpeech/test-clean", "clean", split='train')
librispeech_other = load_dataset("datasets/LibriSpeech/test-other", "other", split='train')

Resolving data files:   0%|          | 0/2707 [00:00<?, ?it/s]



Resolving data files:   0%|          | 0/3029 [00:00<?, ?it/s]



In [None]:
def map_to_ground_truth(batch):
    """
    Attach the reference transcript of one utterance as ``batch['txt']``.

    The audio file ``<chapter-prefix>-<number>.<ext>`` is expected to sit next
    to a transcript file ``<chapter-prefix>.trans.txt`` whose lines have the
    form ``<utterance-id> <text>``. The matching line is found by comparing
    utterance ids, not by line position, so chapters with 100 or more
    utterances or with gaps in the numbering still resolve correctly.

    Args:
        batch: a single example; ``batch['audio']['path']`` is the path of the
            audio file.

    Returns:
        The same ``batch`` with the transcript stored under ``'txt'``.

    Raises:
        KeyError: if the transcript file has no line for this utterance.
    """
    audio_path = batch['audio']['path']
    # File name without extension, e.g. "1089-134686-0000".
    utterance_id = os.path.splitext(os.path.basename(audio_path))[0]
    # Everything before the final "-<number>" names the transcript file.
    chapter_prefix = utterance_id.rsplit('-', 1)[0]
    transcription_file_path = os.path.join(os.path.dirname(audio_path), chapter_prefix + '.trans.txt')
    # The context manager closes the file; the original left the handle open.
    with open(transcription_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line_id, _, txt = line.rstrip('\r\n').partition(' ')
            if line_id == utterance_id:
                batch['txt'] = txt
                return batch
    raise KeyError('no transcript for ' + utterance_id + ' in ' + transcription_file_path)

In [None]:
# Add the reference transcript column ('txt') to both evaluation splits.
librispeech_clean, librispeech_other = [
    split.map(map_to_ground_truth)
    for split in (librispeech_clean, librispeech_other)
]



In [None]:
def add_noise_to_dataset(batch):
    """
    Replace the example's waveform with the output of ``audio_preprocess.add_noise``.

    The samples under ``batch['audio']['array']`` are handed to ``add_noise``
    and the returned samples are written back to the same key.
    Returns the same ``batch`` object.
    """
    audio = batch['audio']
    noisy_samples = audio_preprocess.add_noise(audio['array'])
    audio['array'] = noisy_samples
    return batch

In [None]:
# Noisy variant of test-clean: every example is passed through add_noise_to_dataset.
librispeech_clean_noisy = librispeech_clean.map(add_noise_to_dataset)

  0%|          | 0/2620 [00:00<?, ?ex/s]

In [None]:
def downsample_dataset(batch):
    """
    Replace the example's waveform with the output of ``audio_preprocess.down_sample``.

    The samples under ``batch['audio']['array']`` are handed to ``down_sample``
    and the returned samples are written back to the same key.
    Returns the same ``batch`` object.
    """
    audio = batch['audio']
    resampled = audio_preprocess.down_sample(audio['array'])
    audio['array'] = resampled
    return batch

In [None]:
# Downsampled variant of test-clean: every example is passed through downsample_dataset.
librispeech_clean_downsampled = librispeech_clean.map(downsample_dataset)



In [None]:
def load_wav2vec_model(hf_path: str):
    """
    Load a wav2vec 2.0 CTC checkpoint from the Hugging Face hub.

    ``Wav2Vec2Processor`` is used instead of the deprecated
    ``Wav2Vec2Tokenizer``. Like the old class, it is called on raw audio to
    produce ``input_values`` and exposes ``decode`` for turning predicted ids
    into text, so callers can keep treating the first return value as the
    "tokenizer".

    Args:
        hf_path: hub id or local path of the checkpoint.

    Returns:
        A ``(processor, model)`` tuple; the model has been moved to ``device``.
    """
    processor = Wav2Vec2Processor.from_pretrained(hf_path)
    model = Wav2Vec2ForCTC.from_pretrained(hf_path).to(device)
    return processor, model

In [None]:
# Base checkpoint; map_to_pred reads `tokenizer` and `model` as globals.
tokenizer, model = load_wav2vec_model("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def map_to_pred(batch):
    """
    Transcribe one example with the global ``tokenizer`` / ``model`` pair.

    The waveform in ``batch['audio']['array']`` is converted to model inputs,
    the model is run on ``device``, and the highest-scoring id at every
    position is decoded into text (greedy decoding).

    Returns:
        The same ``batch`` with the predicted text stored under ``'transcription'``.
    """
    # Convert the raw waveform into the tensor the model expects.
    input_values = tokenizer(batch["audio"]["array"], return_tensors="pt").input_values
    # Pure inference: disabling autograd avoids building a graph and keeps
    # memory usage down without changing the logits.
    with torch.no_grad():
        logits = model(input_values.to(device)).logits
    # Most probable id at every position.
    predicted_ids = torch.argmax(logits, dim=-1)
    # Decode the first row of ids, i.e. the single example passed in.
    batch["transcription"] = tokenizer.decode(predicted_ids[0])
    return batch

In [None]:
# Transcribe all four test sets with the base model, in the same order as before.
(
    ls_clean_result,
    ls_other_result,
    ls_clean_noisy_result,
    ls_clean_downsampled_result,
) = [
    test_set.map(map_to_pred)
    for test_set in (
        librispeech_clean,
        librispeech_other,
        librispeech_clean_noisy,
        librispeech_clean_downsampled,
    )
]

  0%|          | 0/2620 [00:00<?, ?ex/s]

  0%|          | 0/2939 [00:00<?, ?ex/s]

  0%|          | 0/2620 [00:00<?, ?ex/s]

  0%|          | 0/2620 [00:00<?, ?ex/s]

In [None]:
def calculate_wer(text, transcription, decimal=1):
    """
    Return the word error rate between ``text`` and ``transcription`` in percent.

    The value returned by ``wer(text, transcription)`` is multiplied by 100 and
    rounded to ``decimal`` decimal places.
    """
    error_rate = wer(text, transcription)
    percentage = 100 * error_rate
    return round(percentage, decimal)

In [None]:
# Report the WER of the base model on each test set, one line per set.
base_reports = (
    ('WER: wav2vec2-base-960h, ls-test-clean:', ls_clean_result),
    ('WER: wav2vec2-base-960h, ls-test-other:', ls_other_result),
    ('WER: wav2vec2-base-960h, ls-test-clean, noisy:', ls_clean_noisy_result),
    ('WER: wav2vec2-base-960h, ls-test-clean, downsampled:', ls_clean_downsampled_result),
)
for report_label, report_result in base_reports:
    print(report_label, calculate_wer(report_result["txt"], report_result["transcription"]), '%.')

WER: wav2vec2-base-960h, ls-test-clean: 3.4 %.
WER: wav2vec2-base-960h, ls-test-other: 9.3 %.
WER: wav2vec2-base-960h, ls-test-clean, noisy: 8.2 %.
WER: wav2vec2-base-960h, ls-test-clean, downsampled: 4.2 %.


# Test on 4-gram

In [None]:
# 4-gram checkpoint; map_to_pred_4g reads `tokenizer4g` and `model4g` as globals.
tokenizer4g, model4g = load_wav2vec_model("patrickvonplaten/wav2vec2-base-960h-4-gram")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at patrickvonplaten/wav2vec2-base-960h-4-gram and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def map_to_pred_4g(batch):
    """
    Transcribe one example with the global ``tokenizer4g`` / ``model4g`` pair.

    The waveform in ``batch['audio']['array']`` is converted to model inputs,
    the model is run on ``device``, and the highest-scoring id at every
    position is decoded into text (greedy decoding).

    NOTE(review): this greedy argmax path never consults an n-gram language
    model, so the result presumably equals what the acoustic model alone
    produces. LM-boosted decoding would presumably need
    ``Wav2Vec2ProcessorWithLM`` -- TODO confirm.

    Returns:
        The same ``batch`` with the predicted text stored under ``'transcription4g'``.
    """
    # Convert the raw waveform into the tensor the model expects.
    input_values = tokenizer4g(batch["audio"]["array"], return_tensors="pt").input_values
    # Pure inference: disabling autograd avoids building a graph and keeps
    # memory usage down without changing the logits.
    with torch.no_grad():
        logits = model4g(input_values.to(device)).logits
    # Most probable id at every position.
    predicted_ids = torch.argmax(logits, dim=-1)
    # Decode the first row of ids, i.e. the single example passed in.
    batch["transcription4g"] = tokenizer4g.decode(predicted_ids[0])
    return batch

In [None]:
# Transcribe with the 4-gram checkpoint. Only the clean split is run; the other
# three evaluations are disabled, so their result variables are never defined.
ls4g_clean_result = librispeech_clean.map(map_to_pred_4g)
# ls4g_other_result = librispeech_other.map(map_to_pred_4g)
# ls4g_clean_noisy_result = librispeech_clean_noisy.map(map_to_pred_4g)
# ls4g_clean_downsampled_result = librispeech_clean_downsampled.map(map_to_pred_4g)

  0%|          | 0/2620 [00:00<?, ?ex/s]

In [None]:
# WER of the 4-gram checkpoint. map_to_pred_4g stores its predictions in the
# "transcription4g" column, so the disabled lines below read that column too.
print('WER: wav2vec2-base-960h-4-gram, ls-test-clean:', calculate_wer(ls4g_clean_result["txt"], ls4g_clean_result["transcription4g"]), '%.')
# print('WER: wav2vec2-base-960h-4-gram, ls-test-other:', calculate_wer(ls4g_other_result["txt"], ls4g_other_result["transcription4g"]), '%.')
# print('WER: wav2vec2-base-960h-4-gram, ls-test-clean, noisy:', calculate_wer(ls4g_clean_noisy_result["txt"], ls4g_clean_noisy_result["transcription4g"]), '%.')
# print('WER: wav2vec2-base-960h-4-gram, ls-test-clean, downsampled:', calculate_wer(ls4g_clean_downsampled_result["txt"], ls4g_clean_downsampled_result["transcription4g"]), '%.')

WER: wav2vec2-base-960h-4-gram, ls-test-clean: 3.4 %.


In [None]:
import pandas as pd

# One (model, test data, result dataset, prediction column) entry per evaluation
# that is actually computed in this notebook. The 4-gram checkpoint is only run
# on test-clean, and map_to_pred_4g stores its predictions in the
# "transcription4g" column. Referencing the three 4-gram results that are never
# created, or a "transcription" column on the 4-gram result, would raise.
evaluations = [
    ('wav2vec2-base-960h', 'ls-test-clean', ls_clean_result, 'transcription'),
    ('wav2vec2-base-960h', 'ls-test-other', ls_other_result, 'transcription'),
    ('wav2vec2-base-960h', 'ls-test-clean, noisy', ls_clean_noisy_result, 'transcription'),
    ('wav2vec2-base-960h', 'ls-test-clean, downsampled', ls_clean_downsampled_result, 'transcription'),
    ('wav2vec2-base-960h-4-gram', 'ls-test-clean', ls4g_clean_result, 'transcription4g'),
]

# `train_names` keeps its original name although it lists the *test* sets.
train_names = [test_data for _, test_data, _, _ in evaluations]
wers = [calculate_wer(result['txt'], result[column]) for _, _, result, column in evaluations]

results = pd.DataFrame({'Model': [model_name for model_name, _, _, _ in evaluations], 'Test Data': train_names, 'WER': wers})
results

Unnamed: 0,Model,Test Data,WER
0,wav2vec2-base-960h,ls-test-clean,3.4
1,wav2vec2-base-960h,ls-test-other,9.3
2,wav2vec2-base-960h,"ls-test-clean, noisy",8.2
3,wav2vec2-base-960h,"ls-test-clean, downsampled",4.2
