In [None]:
# Run in virtual environment
# pip install jupyter-lab
# jupyter-lab

In [None]:
!pip install torch accelerate torchaudio datasets[audio]
!pip install --upgrade transformers
!pip install soundfile
!pip install librosa
!pip install jiwer

In [21]:
from IPython.display import Audio as Play_Audio
from datasets import load_dataset

from transformers import Wav2Vec2ForCTC, AutoProcessor
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

import jiwer

In [22]:
fleurs = load_dataset("google/fleurs", "en_us", split="train", streaming=True)
dataset_iterator = iter(fleurs)

In [3]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
sample = next(dataset_iterator)
data = sample['audio']['array']
sampling_rate = sample['audio']['sampling_rate']

In [24]:
sample

{'id': 903,
 'num_samples': 108800,
 'path': None,
 'audio': {'path': 'train/10004088536354799741.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.15904617e-06, -3.03983688e-06, -3.27825546e-06]),
  'sampling_rate': 16000},
 'transcription': 'a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward',
 'raw_transcription': 'A tornado is a spinning column of very low-pressure air, which sucks the surrounding air inward and upward.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

In [12]:
# Play the audio
Play_Audio(data=data, rate=sampling_rate)

In [25]:
inputs = processor(data, sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs).logits

ids = torch.argmax(outputs, dim=-1)[0]
transcription = processor.decode(ids)

actual = sample['raw_transcription']
prediction =  transcription
metrics = jiwer.compute_measures(actual.lower(), prediction.lower())

In [26]:
actual, prediction

('A tornado is a spinning column of very low-pressure air, which sucks the surrounding air inward and upward.',
 'A TORNADO IS A SPINNING COLUMN OF VERY LOW PRESSURE AIR WHICH SUCKS THE SURROUNDING AIR INWARD AND UPWARD')

In [14]:
actual.lower(), prediction.lower()

('former u.s. speaker of the house newt gingrich came in second with 32 percent.',
 'former u a speaker of the house newt gingrich came in second with thirty two per cent')

In [46]:
ids.shape

torch.Size([383])

In [41]:
inputs.keys()

dict_keys(['input_values'])

In [22]:
transcription

'FORMER U A SPEAKER OF THE HOUSE NEWT GINGRICH CAME IN SECOND WITH THIRTY TWO PER CENT'

In [23]:
sample

{'id': 279,
 'num_samples': 122880,
 'path': None,
 'audio': {'path': 'train/10012216926115652402.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -7.60555267e-05,  8.94069672e-07,  7.99894333e-05]),
  'sampling_rate': 16000},
 'transcription': 'former u.s. speaker of the house newt gingrich came in second with 32%',
 'raw_transcription': 'Former U.S. Speaker of the House Newt Gingrich came in second with 32 percent.',
 'gender': 0,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

In [39]:
metrics

{'wer': 0.42857142857142855,
 'mer': 0.35294117647058826,
 'wil': 0.4915966386554622,
 'wip': 0.5084033613445378,
 'hits': 11,
 'substitutions': 3,
 'deletions': 0,
 'insertions': 3,
 'ops': [[AlignmentChunk(type='equal', ref_start_idx=0, ref_end_idx=1, hyp_start_idx=0, hyp_end_idx=1),
   AlignmentChunk(type='insert', ref_start_idx=1, ref_end_idx=1, hyp_start_idx=1, hyp_end_idx=2),
   AlignmentChunk(type='substitute', ref_start_idx=1, ref_end_idx=2, hyp_start_idx=2, hyp_end_idx=3),
   AlignmentChunk(type='equal', ref_start_idx=2, ref_end_idx=12, hyp_start_idx=3, hyp_end_idx=13),
   AlignmentChunk(type='insert', ref_start_idx=12, ref_end_idx=12, hyp_start_idx=13, hyp_end_idx=15),
   AlignmentChunk(type='substitute', ref_start_idx=12, ref_end_idx=14, hyp_start_idx=15, hyp_end_idx=17)]],
 'truth': [['former',
   'u.s.',
   'speaker',
   'of',
   'the',
   'house',
   'newt',
   'gingrich',
   'came',
   'in',
   'second',
   'with',
   '32',
   'percent.']],
 'hypothesis': [['former',
   

In [47]:
outputs

tensor([[[ 12.3511, -25.6951, -25.3686,  ...,  -5.6248,  -5.7909,  -5.5055],
         [ 12.8965, -25.7347, -25.4000,  ...,  -5.8282,  -6.1460,  -5.8325],
         [ 12.8795, -25.6845, -25.3434,  ...,  -5.7737,  -6.1397,  -5.7737],
         ...,
         [ 12.7404, -26.2029, -25.8405,  ...,  -5.9767,  -6.7822,  -5.7852],
         [ 11.8771, -25.3972, -25.0582,  ...,  -5.8836,  -6.0421,  -5.6338],
         [ 12.5777, -26.0328, -25.6724,  ...,  -5.9033,  -6.5464,  -5.7356]]])

In [48]:
ids

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0, 20,  0,  0,  0,  0,  8, 13,  0, 17,  0,  0,
         5, 13,  0,  0,  0,  4,  0,  0,  0, 16, 16,  0,  4,  0,  0,  7,  0,  0,
         0,  0,  4,  4, 12, 12,  0,  0,  0,  0, 23,  0,  5,  5,  0,  7,  0, 26,
         0,  0,  0,  5, 13,  0,  0,  4,  4,  8, 20,  4,  4,  4,  6, 11,  5,  0,
         4,  4, 11, 11,  0,  0,  0,  0,  8, 16,  0,  0,  0,  0, 12,  5,  5,  0,
         4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  9,  0,  0,  0,  5,
        18,  0,  0,  0,  6,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0, 21,  0,  0,  0,  0, 10,  0,  0,  9,  0,  0,  0, 21,
         0,  0, 13,  0,  0,  0, 10,  0,  0,  0,  0, 19,  0, 11,  0,  0,  0,  4,
         4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

In [50]:
outputs.shape, ids.shape

(torch.Size([1, 383, 32]), torch.Size([383]))

In [53]:
inputs['input_values'].shape

torch.Size([1, 122880])

In [54]:
processor.tokenizer.get_vocab()

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 '|': 4,
 'E': 5,
 'T': 6,
 'A': 7,
 'O': 8,
 'N': 9,
 'I': 10,
 'H': 11,
 'S': 12,
 'R': 13,
 'D': 14,
 'L': 15,
 'U': 16,
 'M': 17,
 'W': 18,
 'C': 19,
 'F': 20,
 'G': 21,
 'Y': 22,
 'P': 23,
 'B': 24,
 'V': 25,
 'K': 26,
 "'": 27,
 'X': 28,
 'J': 29,
 'Q': 30,
 'Z': 31}

In [55]:
processor.tokenizer.decode(1)

'<s>'

In [60]:
processor.decode([5,10,15,20,1,2,3])

'EILF<s></s><unk>'

In [61]:
transcription

'FORMER U A SPEAKER OF THE HOUSE NEWT GINGRICH CAME IN SECOND WITH THIRTY TWO PER CENT'

In [62]:
ids

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0, 20,  0,  0,  0,  0,  8, 13,  0, 17,  0,  0,
         5, 13,  0,  0,  0,  4,  0,  0,  0, 16, 16,  0,  4,  0,  0,  7,  0,  0,
         0,  0,  4,  4, 12, 12,  0,  0,  0,  0, 23,  0,  5,  5,  0,  7,  0, 26,
         0,  0,  0,  5, 13,  0,  0,  4,  4,  8, 20,  4,  4,  4,  6, 11,  5,  0,
         4,  4, 11, 11,  0,  0,  0,  0,  8, 16,  0,  0,  0,  0, 12,  5,  5,  0,
         4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  9,  0,  0,  0,  5,
        18,  0,  0,  0,  6,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0, 21,  0,  0,  0,  0, 10,  0,  0,  9,  0,  0,  0, 21,
         0,  0, 13,  0,  0,  0, 10,  0,  0,  0,  0, 19,  0, 11,  0,  0,  0,  4,
         4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

In [69]:
processor.decode(ids[:50])

'FORMER U'