###### Downloading Necessary Packages

In [1]:
!pip install -qU transformers datasets evaluate jiwer soundfile librosa

###### Setting Up Model Pipeline

In [2]:
# Setting up the Pipeline
from transformers import pipeline
import torch

if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    torch_dtype=torch_dtype,
    device=device,
)

model.safetensors:  47%|####6     | 451M/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [3]:
device

'cuda:0'

###### Logging In to the Hub

In [4]:
# login to the hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

##### Common Voice

###### Analysis for Common Voice English

In [5]:
# Loading the Dataset
from datasets import load_dataset, Dataset

common_voice_test = load_dataset("mozilla-foundation/common_voice_17_0", "en", split="test", streaming=True, trust_remote_code=True) # Load the "test" split of the Common Voice 17.0 dataset for English
common_voice_test = list(common_voice_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(common_voice_test)

Reading metadata...: 16393it [00:01, 14456.44it/s]


250

In [6]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np
common_voice_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in common_voice_test]),
                                       "sentence": [item["sentence"] for item in common_voice_test]})
common_voice_test = common_voice_test.cast_column("audio", Audio())
common_voice_test[0]["audio"]

{'path': None,
 'array': array([0., 0., 0., ..., 0., 0., 0.]),
 'sampling_rate': 48000}

In [7]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(common_voice_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(common_voice_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 250/250 [00:50<00:00,  4.99it/s]


In [8]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=common_voice_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 27.29326705829191


In [9]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in common_voice_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 21.837549933422103


###### Analysis for Common Voice Yoruba

In [10]:
# Loading the Dataset
from datasets import load_dataset, Dataset

common_voice_test = load_dataset("mozilla-foundation/common_voice_17_0", "yo", split="test", streaming=True, trust_remote_code=True) # Load the "test" split of the Common Voice 17.0 dataset for English
common_voice_test = list(common_voice_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(common_voice_test)

Reading metadata...: 999it [00:00, 1540.71it/s]


250

In [11]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np
common_voice_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in common_voice_test]),
                                       "sentence": [item["sentence"] for item in common_voice_test]})
common_voice_test = common_voice_test.cast_column("audio", Audio())
common_voice_test[0]["audio"]

{'path': None,
 'array': array([0.        , 0.        , 0.        , ..., 0.00042725, 0.00042725,
        0.00021362]),
 'sampling_rate': 48000}

In [12]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(common_voice_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(common_voice_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [04:16<00:00,  1.03s/it]


In [13]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=common_voice_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 164.7036181678214


In [14]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in common_voice_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 154.09724583475008


##### Google Fluers

###### Analysis for Google Fluers English

In [15]:
# Loading the Dataset
from datasets import load_dataset, Dataset

google_fleurs_test = load_dataset("google/fleurs", "en_us", split="test", streaming=True, trust_remote_code=True) # Load the "test" split of the Common Voice 17.0 dataset for English
google_fleurs_test = list(google_fleurs_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(google_fleurs_test)

250

In [16]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np

google_fleurs_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in google_fleurs_test]),
                                       "sentence": [item["transcription"] for item in google_fleurs_test]})
google_fleurs_test = google_fleurs_test.cast_column("audio", Audio())
google_fleurs_test[0]["audio"]

{'path': None,
 'array': array([ 0.00000000e+00, -3.05175781e-05,  0.00000000e+00, ...,
        -1.03759766e-03, -9.76562500e-04, -1.25122070e-03]),
 'sampling_rate': 16000}

In [17]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(google_fleurs_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(google_fleurs_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [01:24<00:00,  2.97it/s]


In [18]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=google_fleurs_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 25.668246445497626


In [19]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in google_fleurs_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 6.468498784819593


###### Analysis for Google Fluers Yoruba

In [20]:
# Loading the Dataset
from datasets import load_dataset, Dataset

google_fleurs_test = load_dataset("google/fleurs", "yo_ng", split="test", streaming=True, trust_remote_code=True) # Load the "test" split of the Common Voice 17.0 dataset for English
google_fleurs_test = list(google_fleurs_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(google_fleurs_test)

250

In [21]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np

google_fleurs_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in google_fleurs_test]),
                                       "sentence": [item["transcription"] for item in google_fleurs_test]})
google_fleurs_test = google_fleurs_test.cast_column("audio", Audio())
google_fleurs_test[0]["audio"]

{'path': None,
 'array': array([0.        , 0.        , 0.        , ..., 0.00042725, 0.00036621,
        0.00021362]),
 'sampling_rate': 16000}

In [22]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(google_fleurs_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(google_fleurs_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [04:31<00:00,  1.09s/it]


In [23]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=google_fleurs_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 108.37768240343348


In [24]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in google_fleurs_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 106.57174151150055


##### TED-LIUM Dataset

In [25]:
# Loading the Dataset
from datasets import load_dataset, Dataset

tedlium_test = load_dataset("LIUM/tedlium", "release3", split="train", streaming=True, trust_remote_code=True) # Load the "test" split of the TEDLIUM dataset for English
tedlium_test = list(tedlium_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(tedlium_test)

250

In [26]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np

tedlium_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in tedlium_test]),
                                       "sentence": [item["text"] for item in tedlium_test]})
tedlium_test = tedlium_test.cast_column("audio", Audio())
tedlium_test[0]["audio"]

{'path': None,
 'array': array([ 0.00085449,  0.00082397,  0.00024414, ..., -0.00164795,
        -0.00177002, -0.00076294]),
 'sampling_rate': 16000}

In [27]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(tedlium_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(tedlium_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [01:04<00:00,  3.86it/s]


In [28]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=tedlium_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 36.20080131982088


In [29]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in tedlium_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 6.956521739130435


##### LibriSpeech Dataset

In [30]:
# Loading the Dataset
from datasets import load_dataset, Dataset

libris_speech_test = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) # Load the "test" split of the TEDLIUM dataset for English
libris_speech_test = list(libris_speech_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(libris_speech_test)

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

250

In [31]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np

libris_speech_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in libris_speech_test]),
                                       "sentence": [item["text"] for item in libris_speech_test]})
libris_speech_test = libris_speech_test.cast_column("audio", Audio())
libris_speech_test[0]["audio"]

{'path': None,
 'array': array([ 0.00186157,  0.0005188 ,  0.00024414, ..., -0.00097656,
        -0.00109863, -0.00146484]),
 'sampling_rate': 16000}

In [32]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(libris_speech_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(libris_speech_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [01:17<00:00,  3.21it/s]


In [33]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=libris_speech_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 99.49632738719832


In [34]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in libris_speech_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 3.6511579386605466


##### Miscellaneous Datasets

###### Analysis for OpenSLR Yoruba

In [35]:
# Loading the Dataset
from datasets import load_dataset, Dataset

openslr_test = load_dataset("babs/openslr-yoruba", split="train", streaming=True, trust_remote_code=True) # Load the "test" split of the Common Voice 17.0 dataset for English
openslr_test = list(openslr_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(openslr_test)

250

In [36]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np

openslr_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in openslr_test]),
                                       "sentence": [item["transcription"] for item in openslr_test]})
openslr_test = openslr_test.cast_column("audio", Audio())
openslr_test[0]["audio"]

{'path': None,
 'array': array([0., 0., 0., ..., 0., 0., 0.]),
 'sampling_rate': 48000}

In [37]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(openslr_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(openslr_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [03:30<00:00,  1.19it/s]


In [38]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=openslr_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 167.01132447070407


In [39]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in openslr_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 156.71510347864375


###### Analysis for Yoruba Audio Data

In [40]:
# Loading the Dataset
from datasets import load_dataset, Dataset

openslr_test = load_dataset("odunola/yoruba_audio_data", split="train", streaming=True, trust_remote_code=True) # Load the "test" split of the Common Voice 17.0 dataset for English
openslr_test = list(openslr_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(openslr_test)

250

In [41]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np

openslr_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in openslr_test]),
                                       "sentence": [item["sentence"] for item in openslr_test]})
openslr_test = openslr_test.cast_column("audio", Audio())
openslr_test[0]["audio"]

{'path': None,
 'array': array([-1.22070312e-04, -1.83105469e-04, -1.83105469e-04, ...,
        -9.15527344e-05, -9.15527344e-05, -1.22070312e-04]),
 'sampling_rate': 16000}

In [42]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(openslr_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(openslr_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [03:42<00:00,  1.13it/s]


In [43]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=openslr_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 166.9760625305325


In [44]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in openslr_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 156.58067337122867


###### Analysis for Yoruba Data

In [45]:
# Loading the Dataset
from datasets import load_dataset, Dataset

openslr_test = load_dataset("babs/yoruba-data", split="train", streaming=True, trust_remote_code=True) # Load the "test" split of the Common Voice 17.0 dataset for English
openslr_test = list(openslr_test.take(250)) # Select the first 250 samples from the loaded test dataset (as a list)
len(openslr_test)

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

250

In [46]:
# Converting list back to Dataset
from datasets import Dataset, Audio
import numpy as np

openslr_test = Dataset.from_dict({"audio":    np.array([item["audio"] for item in openslr_test]),
                                       "sentence": [item["sentence"] for item in openslr_test]})
openslr_test = openslr_test.cast_column("audio", Audio())
openslr_test[0]["audio"]

{'path': None,
 'array': array([ 0.00000000e+00, -4.88281250e-04, -5.79833984e-04, ...,
         9.15527344e-05, -1.22070312e-04, -3.05175781e-04]),
 'sampling_rate': 24000}

In [47]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = [] # Initialize an empty list to store predictions

# Run streamed inference
for prediction in tqdm(
    pipe(
        KeyDataset(openslr_test, "audio"),  # Create a dataset (extracting only "audio" column of oriinal dataset)
        max_new_tokens=128,  # Set the maximum number of tokens to generate
        generate_kwargs={"task": "transcribe"},  # Set the task for the pipeline
        batch_size=1,  # Set the batch size for the pipeline
    ),
    total=len(openslr_test),  # Set the total number of samples for the progress bar
):
    # Append the generated text from the prediction to the all_predictions list
    all_predictions.append(prediction["text"])

100%|██████████| 250/250 [04:14<00:00,  1.02s/it]


In [48]:
# Calculating the WER (Word Error Rate)
from evaluate import load

wer_metric = load("wer") # Load the WER metric

# Compute the WER for the original (orthographic) predictions and references
wer_ortho = 100 * wer_metric.compute(
    references=openslr_test["sentence"], predictions=all_predictions
)
# Print the computed orthographic WER
print(f"Wer_Ortho: {wer_ortho}")

Wer_Ortho: 121.08133744368035


In [49]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer() # Initialize the text normalizer
all_predictions_norm = [normalizer(pred) for pred in all_predictions] # Normalizing all predictions
all_references_norm = [normalizer(label) for label in openslr_test["sentence"]] # Normalizing all references

# Filtering step to only evaluate the samples that correspond to non-zero references
all_predictions_norm = [
    all_predictions_norm[i]
    for i in range(len(all_predictions_norm))
    if len(all_references_norm[i]) > 0  # Only keep predictions where the reference is non-zero
]
all_references_norm = [
    all_references_norm[i]
    for i in range(len(all_references_norm))
    if len(all_references_norm[i]) > 0  # Only keep references that are non-zero
]

# Compute the WER for the normalized predictions and references
wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)
# Print the computed normalized WER
print(f"Wer: {wer}")

Wer: 118.24048182404819
