In [24]:
import itertools

import pandas as pd
from datasets import load_dataset
from transformers import pipeline

In [2]:
# Open the train split of the English MLS corpus in streaming mode.
streaming_dataset = load_dataset(
    "parler-tts/mls_eng",
    split="train",
    streaming=True,
)

# Materialise only the first 10 streamed examples into an in-memory list.
sample = [*itertools.islice(streaming_dataset, 10)]

In [3]:
# Peek at the first collected example to see which fields each record carries.
sample[0]

{'audio': {'path': '4800_10003_000000.opus',
  'array': array([-0.00010165, -0.00013155, -0.00016235, ..., -0.00099044,
         -0.00096123, -0.00088181]),
  'sampling_rate': 16000},
 'original_path': 'http://www.archive.org/download/rose_garden_husband_1508_librivox/rose_garden_husband_05_widdemer_64kb.mp3',
 'begin_time': 401.76,
 'end_time': 417.57,
 'transcript': "oh my dear you must see him he expects you she answered almost gayly the procession of three moved down the long room towards a door phyllis's hand guiding the wheel-chair",
 'audio_duration': 15.810000000000002,
 'speaker_id': '4800',
 'book_id': '10003'}

In [5]:
# Build the speech-recognition pipeline around the Whisper large-v3-turbo checkpoint.
# The task is the first positional argument of `pipeline`.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo")

Device set to use mps:0


In [6]:
# Check which concrete pipeline class the `pipeline(...)` factory returned.
type(asr)

transformers.pipelines.automatic_speech_recognition.AutomaticSpeechRecognitionPipeline

In [9]:
# Show the `framework` attribute reported by the pipeline object.
asr.framework

'pt'

In [21]:
# Display the model object wrapped by the pipeline (its repr lists the module tree).
asr.model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [23]:
# Display the first example again as a reminder of the `audio` / `transcript`
# fields that the evaluation loop reads.
sample[0]

{'audio': {'path': '4800_10003_000000.opus',
  'array': array([-0.00010165, -0.00013155, -0.00016235, ..., -0.00099044,
         -0.00096123, -0.00088181]),
  'sampling_rate': 16000},
 'original_path': 'http://www.archive.org/download/rose_garden_husband_1508_librivox/rose_garden_husband_05_widdemer_64kb.mp3',
 'begin_time': 401.76,
 'end_time': 417.57,
 'transcript': "oh my dear you must see him he expects you she answered almost gayly the procession of three moved down the long room towards a door phyllis's hand guiding the wheel-chair",
 'audio_duration': 15.810000000000002,
 'speaker_id': '4800',
 'book_id': '10003'}

In [25]:
def data():
    """Yield ``(waveform, transcript)`` pairs for every example in ``sample``.

    The waveform is handed to the ASR pipeline as a bare array, and the
    pipeline performs no sampling-rate check on bare arrays, so each example
    is validated here against the rate the model's feature extractor expects.

    Yields:
        tuple: the decoded audio array and its reference transcript.

    Raises:
        ValueError: if an example's sampling rate differs from the expected one.
    """
    expected_rate = asr.feature_extractor.sampling_rate
    for example in sample:
        audio = example["audio"]
        if audio["sampling_rate"] != expected_rate:
            raise ValueError(
                f"Expected {expected_rate} Hz audio, got {audio['sampling_rate']} Hz "
                f"for {audio['path']}"
            )
        yield audio["array"], example["transcript"]


# The corpus is English-only, so pin the decoding language instead of letting
# the multilingual checkpoint run language detection on every clip.
output = []
for audio, transcript in data():
    result = asr(audio, generate_kwargs={"language": "en"})
    # Reference transcripts are lower-case, so lower-case the prediction too.
    output.append({"transcript": transcript, "pred": result["text"].lower()})

output_df = pd.DataFrame(output)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


In [27]:
# Remove the column-width limit so full transcripts are shown untruncated.
pd.options.display.max_colwidth = None
output_df

Unnamed: 0,transcript,pred
0,oh my dear you must see him he expects you she answered almost gayly the procession of three moved down the long room towards a door phyllis's hand guiding the wheel-chair,oh my dear you must see him he expects you she answered almost gaily the procession of three moved down the long room towards a door phyllis's hand guiding the wheel chair
1,it was quite as much fun well almost as much hearing her as it would have been to play all of the contented and otherwise elderly people who inhabited the boarding-house with phyllis,it was quite as much fun well almost as much hearing her as it would have been to play all of the contented and otherwise elderly people who inhabited the boarding-house with phyllis
2,the man stole out and shut the door softly phyllis herself rose and went toward the window and busied herself in braiding up her hair there was almost silence in the room for a few minutes,the man stole out and shut the door softly phyllis herself rose and went toward the window and busied herself in braiding up her hair there was almost silence in the room for a few minutes
3,has it said phyllis it was like mrs harrington that careful planning of even where she should be put is mr harrington in his day-room now,has it said phyllis it was like mrs harrington that careful planning of even where she should be put is mr harrington in his day-room now
4,and she insisted that the pink paper stay on the electric lights after about a week of this phyllis suddenly remembered that she had not been selfish at all yet,and she insisted that the pink paper stay on the electric lights after about a week of this phyllis suddenly remembered that she had not been selfish at all yet
5,surprise i-i'm glad you like it said his wife shyly still backing away of course he'd like it said mrs de guenther's kind staccato voice behind him kiss your husband and tell him he's welcome home phyllis child,i'm i'm glad you like it said his wife shyly still backing away of course he'd like it said mrs de guenther's kind staccato voice behind him kiss your husband and tell him he's welcome home phyllis child
6,you have everything that could be asked even to a certain cheerfulness of outlook which poor angela naturally lacks in a measure but-but what about me asked phyllis braithwaite a little piteously in answer to all this,you have everything that could be asked even to a certain cheerfulness of outlook which poor angela naturally lacks in a measure but-but what about me asked phyllis braithwaite a little piteously in answer to all this
7,i've bought myself lots of things she defended herself most of this is really for me and-i can't help being good to him it's only common humanity,i've bought myself lots of things she defended herself most of this is really for me and-i can't help being good to him it's only common humanity
8,his little crumpled black muzzle on the pillow close to allan's contented sleeping face she felt as if she wanted to cry the pathetic lack of interests which made the coming of a new little dog such an event,his little crumpled black muzzle on the pillow close to allan's contented sleeping face she felt as if she wanted to cry the pathetic lack of interest which made the coming of a new little dog such an event
9,she wondered afterwards how she could have spoken with that hard serenity how she could have gone steadily on with story after story poem after poem till allan's grip on her hands relaxed and he fell into a heavy tired sleep,she wondered afterwards how she could have spoken with that hard serenity how she could have gone steadily on with story after story poem after poem till allan's grip on her hands relaxed and he fell into a heavy tired sleep


Para evaluar un modelo de Automatic Speech Recognition (ASR) se usa el Word Error Rate (WER)

- Basado en la distancia de Levenshtein

- Métrica para medir la diferencia entre dos secuencias

- Rango típico de 0 a 1 (puede superar 1 cuando hay muchas inserciones)

- Valores pequeños indican mayor similitud

$$\frac{Substitutions + Insertions + Deletions}{NumberOfWordsSpoken}$$

In [29]:
from evaluate import load

In [31]:
# Load the "wer" metric through `evaluate.load`.
wer = load("wer")

In [34]:
# Load the metric under its own name so that `wer` only ever holds the numeric
# score; this keeps the cell safe to re-run (a float has no `.compute`).
wer_metric = load("wer")
wer = wer_metric.compute(
    # Plain lists of strings: one prediction per reference, in the same order.
    predictions=output_df["pred"].tolist(),
    references=output_df["transcript"].tolist(),
)

In [36]:
# Report the score rounded to three decimal places.
print("The WER for the model is {:.03f}".format(wer))

The WER for the model is 0.018
