### Baseline raw Wav2Vec2 WER on LibriSpeech test-clean

In [1]:
from datasets import load_dataset
from jiwer import wer
import librosa
import nltk
import os
import tarfile
import torch
import urllib.request
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/antonin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# set paths: datasets live in ./datasets relative to the current working directory
datasets_path = os.path.join(os.getcwd(), 'datasets')
# create the folder if it does not already exist; exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(datasets_path, exist_ok=True)
# set device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def download_and_extract_dataset_from_url(url: str, datasets_path: str = datasets_path):
    """
    downloads and extracts dataset from url into datasets_path/

    The archive is first saved as datasets_path/<last path component of url>,
    unpacked into datasets_path and then deleted. The temporary archive is
    removed even when the download or the extraction fails.

    Args:
        url: location of an archive that tarfile.open can read.
        datasets_path: directory receiving the extracted files; it is created
            if it does not exist yet.
    """
    os.makedirs(datasets_path, exist_ok=True)
    temp = os.path.join(datasets_path, url.split('/')[-1])
    try:
        print('downloading dataset...')
        urllib.request.urlretrieve(url, temp)
        print('extracting data...')
        # the context manager closes the archive even if extraction raises
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use this with archives from a trusted source.
        with tarfile.open(temp) as archive:
            archive.extractall(datasets_path)
    finally:
        # never leave a (possibly partial) archive behind
        if os.path.exists(temp):
            os.remove(temp)
    print('done.')

In [4]:
# Fetch the test-clean archive from OpenSLR and unpack it into the default
# datasets_path. The call has no guard, so re-running this cell downloads
# and extracts the archive again.
download_and_extract_dataset_from_url('https://www.openslr.org/resources/12/test-clean.tar.gz')

downloading dataset...
extracting data...
done.


In [None]:
# Load the "test" split ("clean" configuration) from the local LibriSpeech folder.
# The path is relative to the current working directory; it points inside
# datasets_path only as long as the working directory has not changed since
# datasets_path was computed.
librispeech_eval = load_dataset("datasets/LibriSpeech", "clean", split="test")

In [4]:
def load_wav2vec_model(hf_path: str):
    """
    fetch a pretrained wav2vec tokenizer/model pair from huggingface

    hf_path is handed unchanged to both from_pretrained calls; the result is
    the tuple (tokenizer, model), tokenizer first.
    """
    return (
        Wav2Vec2Tokenizer.from_pretrained(hf_path),
        Wav2Vec2ForCTC.from_pretrained(hf_path),
    )

In [36]:
def map_to_text(batch):
    """
    inserts ground truth in dataset

    Looks up the reference transcript of one example and stores it under
    batch['txt'].

    The audio file is expected to be named '<prefix>-<utterance>.<ext>' and
    its transcripts to live next to it in '<prefix>.trans.txt', one
    '<utterance id> <text>' entry per line. The entry is matched on the full
    utterance id, so the result does not depend on line order or on how many
    utterances the file holds.

    Args:
        batch: a single example with the audio file location in
            batch['audio']['path'].

    Returns:
        The same example with the transcript added as batch['txt'].

    Raises:
        KeyError: if the transcript file has no entry for the utterance.
    """
    audio_path = batch['audio']['path']
    # utterance id = file name without its extension
    utterance_id = os.path.splitext(os.path.basename(audio_path))[0]
    # everything before the last '-' names the shared transcript file
    transcript_prefix = utterance_id.rsplit('-', 1)[0]
    transcription_file_path = os.path.join(
        os.path.dirname(audio_path), transcript_prefix + '.trans.txt'
    )
    # 'with' guarantees the handle is closed for every example
    with open(transcription_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line_id, _, txt = line.rstrip('\r\n').partition(' ')
            if line_id == utterance_id:
                batch['txt'] = txt
                return batch
    raise KeyError(
        'no transcript for %s in %s' % (utterance_id, transcription_file_path)
    )

In [37]:
# Run map_to_text over every example so each one carries its reference
# transcript under 'txt'; the name librispeech_eval is rebound to the result.
librispeech_eval = librispeech_eval.map(map_to_text)

100%|██████████| 2620/2620 [03:02<00:00, 14.32ex/s]


In [44]:
# Load the tokenizer and CTC model for the facebook/wav2vec2-base-960h
# checkpoint; both are notebook-level globals that map_to_pred reads.
tokenizer, model = load_wav2vec_model("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def map_to_pred(batch):
    """
    transcribes one example with the notebook-level tokenizer and model

    Reads the waveform from batch["audio"]["array"], runs the model on it and
    stores the greedy (argmax) decoding under batch["transcription"].

    Args:
        batch: a single example holding the raw audio samples in
            batch["audio"]["array"].

    Returns:
        The same example with the predicted text added as
        batch["transcription"].
    """
    # tokenize: turn the raw waveform into a batch-of-one input tensor
    input_values = tokenizer(batch["audio"]["array"], return_tensors="pt").input_values
    # keep the inputs on the same device as the model weights
    input_values = input_values.to(model.device)
    # inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        logits = model(input_values).logits
    # take argmax over the last dimension (most probable token id per position)
    predicted_ids = torch.argmax(logits, dim=-1)
    # decode the token ids of the single sequence in the batch into text
    transcription = tokenizer.decode(predicted_ids[0].cpu())
    batch["transcription"] = transcription
    return batch

In [49]:
# transcribe every example, then score the predictions against the references
result = librispeech_eval.map(map_to_pred)

references, hypotheses = result["txt"], result["transcription"]
print("WER:", wer(references, hypotheses))

100%|██████████| 2620/2620 [14:33<00:00,  3.00ex/s]   


WER: 0.0338557516737675
