### Testing the base wav2vec 2.0 model from the Hugging Face Transformers API

In [1]:
# download & extract test dataset from librispeech
import urllib.request
import tarfile
import os
# Fetch and unpack the LibriSpeech test-clean split only when the extracted
# directory is not already present, so re-running the cell is cheap.
if not os.path.exists('./data/LibriSpeech'):
    print('downloading test dataset (300mb)')
    urllib.request.urlretrieve("https://www.openslr.org/resources/12/test-clean.tar.gz", "test.tar.gz")

    print('extracting data')
    # The context manager closes the archive even if extraction fails part-way.
    with tarfile.open('test.tar.gz') as archive:
        archive.extractall('./data')

In [2]:
# source from https://colab.research.google.com/drive/1dnNrGy1U260L403OuhTsDjBQkdGHmvL9
#import jiwer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import nltk
import librosa
import torch
import soundfile as sf
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/antonin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def load_wav2vec_960h_model():
    """
    Loads the pretrained "facebook/wav2vec2-base-960h" checkpoint.

    Output: (tokenizer, model) tuple -- a Wav2Vec2Tokenizer and a Wav2Vec2ForCTC
    """
    # NOTE(review): on load, transformers warns that this checkpoint declares
    # 'Wav2Vec2CTCTokenizer' while it is loaded here through 'Wav2Vec2Tokenizer'.
    checkpoint = "facebook/wav2vec2-base-960h"
    return (
        Wav2Vec2Tokenizer.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )

def correct_uppercase_sentence(input_text):
    """
    Capitalizes the first character of every sentence in the text.

    Output: the sentences found by nltk.sent_tokenize, re-joined with single spaces
    Input: text to correct
    """
    capitalized = []
    for sentence in nltk.sent_tokenize(input_text):
        # only the leading character is changed; the rest of the sentence is kept as-is
        capitalized.append(sentence[0].capitalize() + sentence[1:])
    return ' '.join(capitalized)

In [4]:
def asr_transcript(tokenizer, model, input_file):
    """
    Returns the transcript of the input audio recording

    Output: Transcribed text (returned exactly as decoded by the tokenizer)
    Input: Huggingface tokenizer, model and path to an audio file readable by soundfile
    """
    target_rate = 16000
    #read the file
    speech, samplerate = sf.read(input_file)
    #make it 1-D: average over the channel axis so any channel count works
    #and the down-mixed signal keeps the same amplitude range as its inputs
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    #Resample to 16khz (keyword arguments: recent librosa releases reject positional rates)
    if samplerate != target_rate:
        speech = librosa.resample(speech, orig_sr=samplerate, target_sr=target_rate)
    #tokenize
    input_values = tokenizer(speech, return_tensors="pt").input_values
    #take logits; inference only, so skip building the autograd graph
    with torch.no_grad():
        logits = model(input_values).logits
    #take argmax over the last axis (most probable token id at each output position)
    predicted_ids = torch.argmax(logits, dim=-1)
    #decode the predicted ids of the first (only) item in the batch
    transcription = tokenizer.decode(predicted_ids[0])
    #the transcript is deliberately NOT passed through correct_uppercase_sentence:
    #it is kept fully capitalized so it can be compared against the reference for WER
    return transcription

In [5]:
#loading the models
tokenizer, model = load_wav2vec_960h_model()

#transcript: every line is "<utterance-id> <reference text>"
chapter_dir = "./data/LibriSpeech/test-clean/61/70968"
with open(chapter_dir + "/61-70968.trans.txt", "r") as f:
    lines = f.read().splitlines()

#goes through the audio files in the chapter; the audio file name is taken
#from the utterance id on each transcript line instead of a hard-coded count
for line in lines:
    utterance_id, _, txt = line.partition(' ')
    if not utterance_id:
        continue  # skip blank lines
    wav_input = chapter_dir + '/' + utterance_id + '.flac'

    print(txt)
    text = asr_transcript(tokenizer, model, wav_input)
    print(text)
    print('-----------------------------------')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LEFT
HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LEFT
-----------------------------------
GIVE NOT SO EARNEST A MIND TO THESE MUMMERIES CHILD
GIVE NOT SO EARNEST A MIND TO THESE MUMMORIES CHILD
-----------------------------------
A GOLDEN FORTUNE AND A HAPPY LIFE
A GOLDEN FORTUNE AND A HAPPY LIFE
-----------------------------------
HE WAS LIKE UNTO MY FATHER IN A WAY AND YET WAS NOT MY FATHER
HE WAS LIKE UNTO MY FATHER IN A WAY AND YET WAS NOT MY FATHER
-----------------------------------
ALSO THERE WAS A STRIPLING PAGE WHO TURNED INTO A MAID
ALSO THERE WAS A STRIPPLING PAGE WHO TURNED INTO A MAID
-----------------------------------
THIS WAS SO SWEET A LADY SIR AND IN SOME MANNER I DO THINK SHE DIED
THIS WAS SO SWEET A LADY SIR AND IN SOME MANNER I DO THINK SHE DIED
-----------------------------------
BUT THEN THE PICTURE WAS GONE AS QUICKLY AS IT 

## Test: computing the word error rate (WER)

In [6]:
from wer import wer

In [7]:
#transcript: every line is "<utterance-id> <reference text>"
chapter_dir = "./data/LibriSpeech/test-clean/61/70968"
with open(chapter_dir + "/61-70968.trans.txt", "r") as f:
    lines = f.read().splitlines()

#goes through the audio files in the chapter; the audio file name is taken
#from the utterance id on each transcript line instead of a hard-coded count
for line in lines:
    utterance_id, _, txt = line.partition(' ')
    if not utterance_id:
        continue  # skip blank lines
    wav_input = chapter_dir + '/' + utterance_id + '.flac'

    text = asr_transcript(tokenizer, model, wav_input)
    #compare reference and hypothesis word by word
    wer(txt.split(' '), text.split(' '))
    print('-----------------------------------')

REF: HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LEFT 
HYP: HE BEGAN A CONFUSED COMPLAINT AGAINST THE WIZARD WHO HAD VANISHED BEHIND THE CURTAIN ON THE LEFT 
EVA:                                                                                                  
WER: 0.00%
-----------------------------------
REF: GIVE NOT SO EARNEST A MIND TO THESE MUMMERIES CHILD 
HYP: GIVE NOT SO EARNEST A MIND TO THESE MUMMORIES CHILD 
EVA:                                     S               
WER: 10.00%
-----------------------------------
REF: A GOLDEN FORTUNE AND A HAPPY LIFE 
HYP: A GOLDEN FORTUNE AND A HAPPY LIFE 
EVA:                                   
WER: 0.00%
-----------------------------------
REF: HE WAS LIKE UNTO MY FATHER IN A WAY AND YET WAS NOT MY FATHER 
HYP: HE WAS LIKE UNTO MY FATHER IN A WAY AND YET WAS NOT MY FATHER 
EVA:                                                               
WER: 0.00%
-----------------------------------
REF: