In [None]:
# Run inside a virtual environment:
#   pip install jupyterlab
#   jupyter-lab

In [7]:
!pip install torch accelerate torchaudio datasets[audio]
!pip install --upgrade transformers
!pip install soundfile
!pip install librosa
!pip install jiwer



In [8]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [9]:
from IPython.display import Audio as Play_Audio
from datasets import load_dataset
  
from transformers import Wav2Vec2ForCTC, AutoProcessor
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

import jiwer

In [10]:
# Open the "en_us" training split of google/fleurs in streaming mode.
fleurs = load_dataset(
    "google/fleurs",
    "en_us",
    split="train",
    streaming=True,
)
# Keep an explicit iterator so later cells can pull one example at a time
# with next().
dataset_iterator = iter(fleurs)

In [11]:
# Processor and CTC model are loaded from the same checkpoint; naming it once
# keeps the two from drifting apart.
checkpoint = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

model.safetensors:  86%|########6 | 325M/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Pull the next example off the stream. Every run of this cell advances the
# iterator, so anything derived from `sample` must be recomputed afterwards.
sample = next(dataset_iterator)
audio = sample['audio']
data, sampling_rate = audio['array'], audio['sampling_rate']

In [15]:
# Inspect the full record: audio payload, transcriptions and metadata fields.
sample

{'id': 722,
 'num_samples': 164160,
 'path': None,
 'audio': {'path': 'train/10035729252730569448.wav',
  'array': array([0.        , 0.        , 0.        , ..., 0.00266123, 0.00290918,
         0.00321454]),
  'sampling_rate': 16000},
 'transcription': 'the island was first inhabited by the taínos and caribes. the caribes were an arawakan-speaking people who had arrived around 10,000 bce',
 'raw_transcription': 'The island was first inhabited by the Taínos and Caribes. The Caribes were an Arawakan-speaking people who had arrived around 10,000 BCE.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

In [16]:
# Render an inline audio player for the current sample's waveform, using the
# sampling rate that came with it.
Play_Audio(data=data, rate=sampling_rate)

In [17]:
# Greedy CTC transcription of the current sample, scored against its reference.

# Pass the sample's own rate rather than a literal: the processor can then
# reject audio recorded at a rate the model was not trained on, instead of
# silently transcribing mis-sampled input.
inputs = processor(data, sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():  # inference only; no autograd graph needed
    outputs = model(**inputs).logits

# Most likely token per frame; [0] drops the batch axis (single clip).
ids = torch.argmax(outputs, dim=-1)[0]
transcription = processor.decode(ids)

actual = sample['raw_transcription']
prediction = transcription

# The tokenizer vocabulary holds only letters and the apostrophe, so the
# hypothesis can never contain punctuation. Strip punctuation and case from
# BOTH sides so that e.g. "caribes." vs "caribes" is not counted as an error.
normalise = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
])
# NOTE(review): jiwer 3.x marks compute_measures as deprecated in favour of
# process_words; it is kept here so `metrics` stays a plain dict. Confirm the
# installed jiwer version still provides it.
metrics = jiwer.compute_measures(normalise(actual), normalise(prediction))

In [18]:
# Reference text side by side with the model's transcription, as-is.
actual, prediction

('The island was first inhabited by the Taínos and Caribes. The Caribes were an Arawakan-speaking people who had arrived around 10,000 BCE.',
 'THE ISLAND WAS FIRST INHABITED BY THE TIANOS AND CARIVIS THE CARIVES WERE IN ARAQUAK AND SPEAKING PEOPLE WHO HAD ARRIVED AROUND TEN THOUSAND B C')

In [19]:
# Case-folded view of the same pair, to eyeball word-level differences.
actual.lower(), prediction.lower()

('the island was first inhabited by the taínos and caribes. the caribes were an arawakan-speaking people who had arrived around 10,000 bce.',
 'the island was first inhabited by the tianos and carivis the carives were in araquak and speaking people who had arrived around ten thousand b c')

In [20]:
# One argmax token id per logit frame (the batch axis was indexed away).
ids.shape

torch.Size([512])

In [21]:
# Which tensors the processor produced for the model call.
inputs.keys()

dict_keys(['input_values'])

In [22]:
# Text decoded from `ids` by the processor.
transcription

'THE ISLAND WAS FIRST INHABITED BY THE TIANOS AND CARIVIS THE CARIVES WERE IN ARAQUAK AND SPEAKING PEOPLE WHO HAD ARRIVED AROUND TEN THOUSAND B C'

In [23]:
# Re-inspect the dataset record to compare its transcription fields with the
# model output.
sample

{'id': 722,
 'num_samples': 164160,
 'path': None,
 'audio': {'path': 'train/10035729252730569448.wav',
  'array': array([0.        , 0.        , 0.        , ..., 0.00266123, 0.00290918,
         0.00321454]),
  'sampling_rate': 16000},
 'transcription': 'the island was first inhabited by the taínos and caribes. the caribes were an arawakan-speaking people who had arrived around 10,000 bce',
 'raw_transcription': 'The island was first inhabited by the Taínos and Caribes. The Caribes were an Arawakan-speaking people who had arrived around 10,000 BCE.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

In [24]:
# Word-level error measures and alignment returned by jiwer for this sample.
metrics

{'wer': 0.5,
 'mer': 0.4230769230769231,
 'wil': 0.6066433566433567,
 'wip': 0.3933566433566433,
 'hits': 15,
 'substitutions': 7,
 'deletions': 0,
 'insertions': 4,
 'ops': [[AlignmentChunk(type='equal', ref_start_idx=0, ref_end_idx=7, hyp_start_idx=0, hyp_end_idx=7),
   AlignmentChunk(type='substitute', ref_start_idx=7, ref_end_idx=8, hyp_start_idx=7, hyp_end_idx=8),
   AlignmentChunk(type='equal', ref_start_idx=8, ref_end_idx=9, hyp_start_idx=8, hyp_end_idx=9),
   AlignmentChunk(type='substitute', ref_start_idx=9, ref_end_idx=10, hyp_start_idx=9, hyp_end_idx=10),
   AlignmentChunk(type='equal', ref_start_idx=10, ref_end_idx=11, hyp_start_idx=10, hyp_end_idx=11),
   AlignmentChunk(type='substitute', ref_start_idx=11, ref_end_idx=12, hyp_start_idx=11, hyp_end_idx=12),
   AlignmentChunk(type='equal', ref_start_idx=12, ref_end_idx=13, hyp_start_idx=12, hyp_end_idx=13),
   AlignmentChunk(type='insert', ref_start_idx=13, ref_end_idx=13, hyp_start_idx=13, hyp_end_idx=15),
   AlignmentChunk

In [25]:
# Raw logits from the model's forward pass.
outputs

tensor([[[ 13.3458, -29.6154, -29.2445,  ...,  -7.5523,  -8.1111,  -6.6901],
         [ 12.9680, -29.2903, -28.9244,  ...,  -7.3712,  -8.0302,  -6.7725],
         [ 12.9815, -29.2993, -28.9313,  ...,  -7.3688,  -8.0310,  -6.7584],
         ...,
         [ 12.4908, -28.0922, -27.7737,  ...,  -6.3878,  -7.4381,  -5.8884],
         [ 13.1866, -29.9725, -29.6471,  ...,  -7.3754,  -8.0912,  -6.9170],
         [ 13.3954, -30.0570, -29.7159,  ...,  -7.6895,  -8.4598,  -6.9933]]])

In [48]:
# Frame-wise argmax token ids, before the processor turns them into text.
ids

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0, 20,  0,  0,  0,  0,  8, 13,  0, 17,  0,  0,
         5, 13,  0,  0,  0,  4,  0,  0,  0, 16, 16,  0,  4,  0,  0,  7,  0,  0,
         0,  0,  4,  4, 12, 12,  0,  0,  0,  0, 23,  0,  5,  5,  0,  7,  0, 26,
         0,  0,  0,  5, 13,  0,  0,  4,  4,  8, 20,  4,  4,  4,  6, 11,  5,  0,
         4,  4, 11, 11,  0,  0,  0,  0,  8, 16,  0,  0,  0,  0, 12,  5,  5,  0,
         4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  9,  0,  0,  0,  5,
        18,  0,  0,  0,  6,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0, 21,  0,  0,  0,  0, 10,  0,  0,  9,  0,  0,  0, 21,
         0,  0, 13,  0,  0,  0, 10,  0,  0,  0,  0, 19,  0, 11,  0,  0,  0,  4,
         4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

In [50]:
# Logits carry a leading batch axis and a trailing per-token axis; `ids`
# keeps only the frame axis.
outputs.shape, ids.shape

(torch.Size([1, 383, 32]), torch.Size([383]))

In [53]:
# Size of the waveform tensor that was fed to the model.
inputs['input_values'].shape

torch.Size([1, 122880])

In [54]:
# Token-to-id mapping used by the tokenizer; needed to read `ids` by hand.
processor.tokenizer.get_vocab()

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 '|': 4,
 'E': 5,
 'T': 6,
 'A': 7,
 'O': 8,
 'N': 9,
 'I': 10,
 'H': 11,
 'S': 12,
 'R': 13,
 'D': 14,
 'L': 15,
 'U': 16,
 'M': 17,
 'W': 18,
 'C': 19,
 'F': 20,
 'G': 21,
 'Y': 22,
 'P': 23,
 'B': 24,
 'V': 25,
 'K': 26,
 "'": 27,
 'X': 28,
 'J': 29,
 'Q': 30,
 'Z': 31}

In [55]:
# Look up the token string behind a single id.
processor.tokenizer.decode(1)

'<s>'

In [60]:
# Decode a hand-picked id list that mixes letter ids with special-token ids
# (1, 2, 3) to see how each kind is rendered.
processor.decode([5,10,15,20,1,2,3])

'EILF<s></s><unk>'

In [61]:
# NOTE(review): shows the result of whichever inference run executed last;
# the execution counts in this notebook are not sequential, so re-run the
# sample and inference cells before trusting this value.
transcription

'FORMER U A SPEAKER OF THE HOUSE NEWT GINGRICH CAME IN SECOND WITH THIRTY TWO PER CENT'

In [62]:
# Token ids from the most recently executed inference run.
ids

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0, 20,  0,  0,  0,  0,  8, 13,  0, 17,  0,  0,
         5, 13,  0,  0,  0,  4,  0,  0,  0, 16, 16,  0,  4,  0,  0,  7,  0,  0,
         0,  0,  4,  4, 12, 12,  0,  0,  0,  0, 23,  0,  5,  5,  0,  7,  0, 26,
         0,  0,  0,  5, 13,  0,  0,  4,  4,  8, 20,  4,  4,  4,  6, 11,  5,  0,
         4,  4, 11, 11,  0,  0,  0,  0,  8, 16,  0,  0,  0,  0, 12,  5,  5,  0,
         4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  9,  0,  0,  0,  5,
        18,  0,  0,  0,  6,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0, 21,  0,  0,  0,  0, 10,  0,  0,  9,  0,  0,  0, 21,
         0,  0, 13,  0,  0,  0, 10,  0,  0,  0,  0, 19,  0, 11,  0,  0,  0,  4,
         4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

In [69]:
# Decode only the first 50 frames to see how runs of id 0 (<pad>) and
# repeated ids disappear from the resulting text.
processor.decode(ids[:50])

'FORMER U'