In [1]:
# Capture the output lines of `nvidia-smi` using IPython's shell-capture syntax.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in the command output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Jul 30 09:01:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   48C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install transformers datasets==3.6.0 torch evaluate jiwer

Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metad

# Eval Utils

In [3]:
import os
import glob
import json

import evaluate
import pandas as pd
from collections import defaultdict


def read_manifest(manifest_path: str):
    """
    Reads a manifest file (jsonl format) and returns a list of dictionaries containing samples.

    Args:
        manifest_path: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            # Lines yielded by file iteration keep their trailing newline, so a
            # plain length check never filters anything; strip before testing
            # so blank / whitespace-only lines are skipped instead of crashing
            # json.loads.
            if line.strip():
                data.append(json.loads(line))
    return data


def write_manifest(
    references: list,
    transcriptions: list,
    model_id: str,
    dataset_path: str,
    dataset_name: str,
    audio_length: list = None,
    transcription_time: list = None,
):
    """
    Writes a manifest file (jsonl format) and returns the path to the file.

    The file is written to ``./results/`` (created if missing) and is named
    ``MODEL_<model_id>_DATASET_<dataset_path>_<dataset_name>.jsonl``, with every
    "/" in those three identifiers replaced by "-".

    Args:
        references: Ground truth reference texts.
        transcriptions: Model predicted transcriptions.
        model_id: String identifier for the model.
        dataset_path: Path to the dataset.
        dataset_name: Name of the dataset.
        audio_length: Length of each audio sample in seconds.
        transcription_time: Transcription time of each sample in seconds.

    Returns:
        Path to the manifest file.

    Raises:
        ValueError: If `transcriptions`, `audio_length` or `transcription_time`
            (when given) do not have the same number of entries as `references`.
    """
    model_id = model_id.replace("/", "-")
    dataset_path = dataset_path.replace("/", "-")
    dataset_name = dataset_name.replace("/", "-")

    if len(references) != len(transcriptions):
        raise ValueError(
            f"The number of samples in `references` ({len(references)}) "
            f"must match `transcriptions` ({len(transcriptions)})."
        )

    if audio_length is not None and len(audio_length) != len(references):
        raise ValueError(
            f"The number of samples in `audio_length` ({len(audio_length)}) "
            f"must match `references` ({len(references)})."
        )
    if transcription_time is not None and len(transcription_time) != len(references):
        raise ValueError(
            f"The number of samples in `transcription_time` ({len(transcription_time)}) "
            f"must match `references` ({len(references)})."
        )

    # Missing timing information is written as null for every sample.
    durations = audio_length if audio_length is not None else [None] * len(references)
    times = (
        transcription_time
        if transcription_time is not None
        else [None] * len(references)
    )

    basedir = "./results/"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(basedir, exist_ok=True)

    manifest_path = os.path.join(
        basedir, f"MODEL_{model_id}_DATASET_{dataset_path}_{dataset_name}.jsonl"
    )

    with open(manifest_path, "w", encoding="utf-8") as f:
        # Per-sample names are kept distinct from the list parameters so the
        # arguments are not rebound while they are being iterated.
        for idx, (text, transcript, duration, elapsed) in enumerate(
            zip(references, transcriptions, durations, times)
        ):
            datum = {
                "audio_filepath": f"sample_{idx}",  # dummy value for Speech Data Processor
                "duration": duration,
                "time": elapsed,
                "text": text,
                "pred_text": transcript,
            }
            f.write(f"{json.dumps(datum, ensure_ascii=False)}\n")
    return manifest_path


def score_results(directory: str, model_id: str = None):
    """
    Scores all result files in a directory and returns a composite score over all evaluated datasets.

    Result files are expected to be named
    ``MODEL_<model>_DATASET_<dataset>.jsonl`` and to contain one JSON object per
    line with the keys ``text``, ``pred_text``, ``time`` and ``duration``.

    Args:
        directory: Path to the result directory, containing one or more jsonl files.
        model_id: Optional, model name to filter out result files based on model name.

    Returns:
        A tuple ``(composite_wer, results)``. ``results`` maps
        ``"<model> | <dataset>"`` to a dict with ``wer``, ``audio_length``,
        ``inference_time`` and ``rtfx``. ``composite_wer`` maps each model to
        the *sum* of its per-dataset WERs; the printed composite value is that
        sum divided by the number of datasets.

    Raises:
        ValueError: If no result files are found.
    """

    # Strip trailing directory separators. (os.pathsep is the PATH-list
    # separator, not the directory separator, so it must not be used here.)
    trimmed = directory.rstrip("/" + os.sep)
    if trimmed:  # keep a bare root such as "/" untouched
        directory = trimmed

    # Find all result files in the directory
    result_files = sorted(glob.glob(f"{directory}/**/*.jsonl", recursive=True))

    # Filter files belonging to a specific model id
    if model_id is not None and model_id != "":
        print("Filtering models by id:", model_id)
        model_id = model_id.replace("/", "-")
        result_files = [fp for fp in result_files if model_id in fp]

    # Check if any result files were found
    if len(result_files) == 0:
        raise ValueError(f"No result files found in {directory}")

    # Utility function to parse the file path and extract model id and dataset id
    def parse_filepath(fp: str):
        model_index = fp.find("MODEL_")
        fp = fp[model_index:]
        ds_index = fp.find("DATASET_")
        parsed_model = fp[:ds_index].replace("MODEL_", "").rstrip("_")
        # File names have "/" replaced by "-"; restore only the first one
        # (author/model). Ids without any "-" are left as they are.
        author, dash, name = parsed_model.partition("-")
        if dash:
            parsed_model = f"{author}/{name}"

        dataset_id = fp[ds_index:].replace("DATASET_", "")
        # Remove the extension as a suffix. str.rstrip(".jsonl") would strip any
        # trailing run of the characters ".jsonl" and eat into the dataset name.
        suffix = ".jsonl"
        if dataset_id.endswith(suffix):
            dataset_id = dataset_id[: -len(suffix)]
        return parsed_model, dataset_id

    # Compute WER results per dataset, and RTFx over all datasets
    results = {}
    wer_metric = evaluate.load("wer")

    for result_file in result_files:
        manifest = read_manifest(result_file)
        if not manifest:
            # Nothing to score; avoids dividing by a zero total time below.
            print(f"Skipping empty result file: {result_file}")
            continue
        model_id_of_file, dataset_id = parse_filepath(result_file)

        references = [datum["text"] for datum in manifest]
        predictions = [datum["pred_text"] for datum in manifest]

        times = [datum["time"] for datum in manifest]
        durations = [datum["duration"] for datum in manifest]
        # RTFx is only reported when every sample has non-null, non-zero timing.
        compute_rtfx = all(times) and all(durations)

        wer = wer_metric.compute(references=references, predictions=predictions)
        wer = round(100 * wer, 2)

        if compute_rtfx:
            audio_length = sum(durations)
            inference_time = sum(times)
            rtfx = round(audio_length / inference_time, 4)
        else:
            audio_length = inference_time = rtfx = None

        result_key = f"{model_id_of_file} | {dataset_id}"
        results[result_key] = {"wer": wer, "audio_length": audio_length, "inference_time": inference_time, "rtfx": rtfx}

    print("*" * 80)
    print("Results per dataset:")
    print("*" * 80)

    for k, v in results.items():
        metrics = f"{k}: WER = {v['wer']:0.2f} %"
        if v["rtfx"] is not None:
            metrics += f", RTFx = {v['rtfx']:0.2f}"
        print(metrics)

    # composite WER should be computed over all datasets and with the same key
    composite_wer = defaultdict(float)
    composite_audio_length = defaultdict(float)
    composite_inference_time = defaultdict(float)
    count_entries = defaultdict(int)
    # Models with at least one dataset lacking timing get no composite RTFx.
    # Tracking them in a set (instead of overwriting the accumulators with
    # None) keeps the "+=" below valid whatever the order of the files.
    missing_timing = set()
    for k, v in results.items():
        key = k.split("|")[0].strip()
        composite_wer[key] += v["wer"]
        count_entries[key] += 1
        if v["rtfx"] is not None:
            composite_audio_length[key] += v["audio_length"]
            composite_inference_time[key] += v["inference_time"]
        else:
            missing_timing.add(key)

    # normalize scores & print
    print()
    print("*" * 80)
    print("Composite Results:")
    print("*" * 80)
    for k, v in composite_wer.items():
        wer = v / count_entries[k]
        print(f"{k}: WER = {wer:0.2f} %")
    for k in composite_wer:
        if k in missing_timing:
            continue
        rtfx = composite_audio_length[k] / composite_inference_time[k]
        print(f"{k}: RTFx = {rtfx:0.2f}")
    print("*" * 80)
    return composite_wer, results


# Normalizer

In [4]:
# Copyright 2022 The OpenAI team and The HuggingFace Team. All rights reserved.
# Most of the code is copy pasted from the original whisper repository
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import unicodedata
from fractions import Fraction
from typing import Iterator, List, Match, Optional, Union

import regex


# Letters that "NFKD" normalization leaves intact and that therefore need an
# explicit replacement.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}


def remove_symbols_and_diacritics(s: str, keep=""):
    """
    NFKD-normalize `s`, then rewrite it character by character:

    - characters contained in `keep` are passed through unchanged;
    - characters listed in ADDITIONAL_DIACRITICS are replaced by their mapping;
    - non-spacing marks (category 'Mn') are dropped;
    - any other mark, symbol or punctuation (categories M*, S*, P*) becomes a space;
    - everything else is kept as is.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKD", s):
        if ch in keep:
            pieces.append(ch)
            continue

        if ch in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[ch])
            continue

        category = unicodedata.category(ch)
        if category == "Mn":
            # combining accent split off by NFKD: drop it entirely
            continue

        pieces.append(" " if category[0] in "MSP" else ch)

    return "".join(pieces)


def remove_symbols(s: str):
    """
    NFKC-normalize `s` and turn every mark, symbol or punctuation character
    (Unicode categories M*, S*, P*) into a single space; diacritics that NFKC
    composes into a letter are therefore kept.
    """
    cleaned = []
    for ch in unicodedata.normalize("NFKC", s):
        major_class = unicodedata.category(ch)[0]
        cleaned.append(" " if major_class in "MSP" else ch)
    return "".join(cleaned)


class BasicTextNormalizer:
    """
    Callable text normalizer: lower-cases, removes bracketed / parenthesized
    spans, replaces symbols (optionally also stripping diacritics) and
    collapses whitespace.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # Choose the symbol-cleaning function once, at construction time.
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        text = s.lower()
        # drop anything enclosed in <...> or [...]
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        # drop non-empty (...) groups
        text = re.sub(r"\(([^)]+?)\)", "", text)
        text = self.clean(text).lower()

        if self.split_letters:
            # separate every extended grapheme cluster with a space
            graphemes = regex.findall(r"\X", text, regex.U)
            text = " ".join(graphemes)

        # collapse each run of whitespace into one space
        return re.sub(r"\s+", " ", text)

# Data Utils

In [5]:
from datasets import load_dataset, Audio

normalizer = BasicTextNormalizer()


def normalize(batch):
    """Store the normalized form of ``batch["transcript"]`` under ``batch["norm_text"]`` and return the batch."""
    transcript = batch["transcript"]
    batch["norm_text"] = normalizer(transcript)
    return batch


def load_data(args):
    """
    Load the dataset described by `args`.

    Reads `args.dataset_path` and `args.dataset` and passes them as the first
    two positional arguments of `load_dataset`, together with `token=True`.
    Returns whatever `load_dataset` returns.
    """
    return load_dataset(args.dataset_path, args.dataset, token=True)


def prepare_data(dataset, split: str = "train"):
    """
    Select one split of `dataset`, cast its audio column to 16 kHz and add
    normalized transcripts.

    Args:
        dataset: Mapping of split name to dataset, as returned by `load_data`.
        split: Name of the split to prepare. Defaults to "train", the split
            that was previously hard-coded.

    Returns:
        The selected split with the "audio" column cast to
        `Audio(sampling_rate=16_000)` and `normalize` mapped over every row.
    """
    # Step 1: Resample audio
    dataset = dataset[split].cast_column("audio", Audio(sampling_rate=16_000))

    # Step 2: Normalize transcripts
    dataset = dataset.map(normalize)

    return dataset

# Run Eval

## Imports


In [6]:
import os
import time
from tqdm import tqdm
from transformers import WhisperForConditionalGeneration, WhisperProcessor, Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor
import torch
from torch.nn.attention import sdpa_kernel, SDPBackend
from evaluate import load
from types import SimpleNamespace

In [7]:
# Constants
# Word-error-rate metric object obtained from evaluate's `load("wer")`.
wer_metric = load("wer")
# Threshold in seconds, read by `is_audio_length_in_range`.
MIN_DURATION_IN_SECONDS = 3.0

Downloading builder script: 0.00B [00:00, ?B/s]

### WHISPER MODEL EXPERIMENTS

In [None]:
# Whisper checkpoint ids noted for these experiments:
#   openai/whisper-small
#   openai/whisper-medium
#   openai/whisper-large-v2
#   openai/whisper-large-v3
#   nimamehrafar/whisper-dutch-finetuned
#   golesheed/whisper-native-children-5-dutch

# Load the model onto the GPU; the processor is loaded from the same checkpoint id.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2").to("cuda")
# Disabled alternative that read the checkpoint id from `args`:
#processor = WhisperProcessor.from_pretrained(args.model_id, torch_dtype=torch.bfloat16, attn_implementation="sdpa")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [None]:
# Disabled: loop over several dataset subsets.
#all_subsets = ["SK-ADHD-C-S", "SK-ADHD-W-S", "SK-TD-C-S", "SK-TD-W-S"]

#for subset in all_subsets:

# Run configuration. `model_id` is read by the model-loading cells,
# `dataset_path` / `dataset` by `load_data`, and `batch_size` by `benchmark`.
# NOTE(review): `torch_compile`, `compile_mode` and `streaming` are not read by
# any code visible in this notebook - confirm whether they are still needed.
args = SimpleNamespace(
      model_id="facebook/wav2vec2-large-xlsr-53-dutch",
      dataset_path="bchiusano/AllAsymmetriesCHILDES",
      dataset="SK-ADHD-C-S",
      batch_size=16,
      torch_compile=True,
      compile_mode="max-autotune",
      streaming=False,
  )

# Load data (disabled here)
#dataset = load_data(args)
#dataset = prepare_data(dataset)

### FACEBOOK WAV2VEC

In [None]:
# Model: facebook/wav2vec2-large-xlsr-53-dutch
# Both objects are loaded from `args.model_id`; the model is moved to the GPU.
model = Wav2Vec2ForCTC.from_pretrained(args.model_id).to("cuda")
# NOTE(review): `torch_dtype` and `attn_implementation` look like model-loading
# options; confirm they have any effect when passed to a processor, or whether
# they were meant for the model call above.
processor = Wav2Vec2Processor.from_pretrained(args.model_id, torch_dtype=torch.bfloat16, attn_implementation="sdpa")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

### FACEBOOK MMS-1B

In [None]:
# Model: facebook/mms-1b-all
# NOTE(review): this cell loads `args.model_id`, which the args cell in this
# notebook sets to "facebook/wav2vec2-large-xlsr-53-dutch" - set it to the MMS
# checkpoint before running this cell.
processor = AutoProcessor.from_pretrained(args.model_id)
model = Wav2Vec2ForCTC.from_pretrained(args.model_id).to("cuda")
# Load Language Adapter ("nld" is set on both the tokenizer and the model)
processor.tokenizer.set_target_lang("nld")
model.load_adapter("nld")

### Methods

In [None]:
def is_audio_length_in_range(input_length):
    """Return True when `input_length` is strictly greater than MIN_DURATION_IN_SECONDS."""
    exceeds_minimum = input_length > MIN_DURATION_IN_SECONDS
    return exceeds_minimum

In [8]:

def benchmark(batch):
  """
  Transcribe one batch of audio and attach predictions and timing to it.

  Relies on the module-level `model`, `processor`, `args` and `normalizer`.
  The calls used here (`inputs.input_features`, `model.generate(..., task=,
  language=)`) match the Whisper model / processor cells of this notebook.

  Args:
      batch: Batched rows with an "audio" column (each entry exposing an
          "array") and a "norm_text" column.

  Returns:
      The same batch with "audio_length_s", "transcription_time_s",
      "predictions" and "references" added.
  """
  # Load audio inputs
  audios = [audio["array"] for audio in batch["audio"]]
  # Duration in seconds, computed with a fixed 16 kHz sample rate.
  batch["audio_length_s"] = [len(audio) / 16000 for audio in audios]
  minibatch_size = len(audios)

  # START TIMING
  start_time = time.time()

  # 1. Pre-Processing
  # Pad a short (typically the last) minibatch up to args.batch_size by
  # repeating its final sample; the extra outputs are dropped again below.
  padding_size = None
  if minibatch_size != args.batch_size:
      padding_size = args.batch_size - minibatch_size
      padding_audios = [audios[-1] for _ in range(padding_size)]
      audios.extend(padding_audios)

  # Ask the processor for the attention mask explicitly. Without it
  # `inputs.get("attention_mask")` is None and generation logs the
  # "attention mask is not set" warning seen in this notebook's run output.
  inputs = processor(
      audios, sampling_rate=16_000, return_tensors="pt", return_attention_mask=True
  )

  # 2. Model Inference
  # Move every tensor handed to generate() onto the model's device; the mask
  # was previously left wherever the processor created it.
  device = model.device
  input_features = inputs.input_features.to(device)
  attention_mask = inputs.get("attention_mask")
  if attention_mask is not None:
      attention_mask = attention_mask.to(device)

  with torch.no_grad():
      predicted_ids = model.generate(
          input_features, task="transcribe", language="nl", attention_mask=attention_mask
      )

  # Remove the padding
  if padding_size is not None:
    predicted_ids = predicted_ids[:-padding_size, ...]

  # Convert token ids to text transcription
  pred_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)

  # END TIMING
  runtime = time.time() - start_time

  # normalize by minibatch size since we want the per-sample time
  batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size]

  # normalize transcriptions with the same module-level `normalizer` that
  # produced the reference "norm_text" column
  batch["predictions"] = [normalizer(pred) for pred in pred_text]
  batch["references"] = batch["norm_text"]
  return batch

### Running the Benchmark and Post-Processing

In [9]:
import gc

In [11]:
#all_subsets = ["CK-TD-C-S", "CK-TD-W-S", "SK-ADHD-C-S", "SK-ADHD-W-S", "SK-TD-C-S", "SK-TD-W-S"]
all_subsets = ["CK-TD-W-S", "SK-ADHD-C-S", "SK-ADHD-W-S", "SK-TD-C-S", "SK-TD-W-S"]

# Evaluate the ten "golesheed/whisper-native-children-<n>-dutch" checkpoints on
# every subset. `model`, `processor` and `args` are module-level names because
# `benchmark` reads them as globals.
for model_idx in range(0, 10):
  model_name = f"golesheed/whisper-native-children-{model_idx}-dutch"

  # Drop the reference to the previous checkpoint and release cached GPU memory
  # before loading the next one, so two checkpoints are not resident at once.
  model = None
  gc.collect()
  torch.cuda.empty_cache()

  model = WhisperForConditionalGeneration.from_pretrained(model_name).to("cuda")
  processor = WhisperProcessor.from_pretrained(model_name)

  for subset in all_subsets:
    print(f"processing {model_name} for subset {subset}")
    args = SimpleNamespace(
          model_id= model_name,
          dataset_path="bchiusano/AllAsymmetriesCHILDES",
          dataset=subset,
          batch_size=16,
          torch_compile=True,
          compile_mode="max-autotune",
          streaming=False,
      )

    # Load data
    dataset = load_data(args)
    dataset = prepare_data(dataset)

    results = dataset.map(
        benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"],
    )

    # Post-processing - drop samples whose prediction has more than twice as
    # many words as its reference. Columns are read once instead of indexing
    # the dataset row by row, and the loop index no longer reuses the outer
    # loop variable.
    keep_idx = [
        sample_idx
        for sample_idx, (reference, prediction) in enumerate(
            zip(results["references"], results["predictions"])
        )
        if len(prediction.split()) <= 2 * len(reference.split())
    ]
    results = results.select(keep_idx)

    all_results = {
        "audio_length_s": [],
        "transcription_time_s": [],
        "predictions": [],
        "references": [],
    }
    result_iter = iter(results)
    for result in tqdm(result_iter, desc="Samples..."):
        for key in all_results:
            all_results[key].append(result[key])

    # Write manifest results (WER and RTFX)
    manifest_path = write_manifest(
        all_results["references"],
        all_results["predictions"],
        args.model_id,
        args.dataset_path,
        args.dataset,
        audio_length=all_results["audio_length_s"],
        transcription_time=all_results["transcription_time_s"],
    )
    print("Results saved at path:", os.path.abspath(manifest_path))

    wer = wer_metric.compute(
        references=all_results["references"], predictions=all_results["predictions"]
    )
    wer = round(100 * wer, 2)
    # RTFx = total audio seconds / total transcription seconds.
    rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2)
    print("WER:", wer, "%", "RTFx:", rtfx)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

processing golesheed/whisper-native-children-0-dutch for subset CK-TD-W-S


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/525 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Samples...: 509it [00:00, 9463.13it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-0-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_CK-TD-W-S.jsonl
WER: 67.81 % RTFx: 4.05
processing golesheed/whisper-native-children-0-dutch for subset SK-ADHD-C-S


train-00000-of-00001.parquet:   0%|          | 0.00/37.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/902 [00:00<?, ? examples/s]

Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 881it [00:00, 10090.17it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-0-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-C-S.jsonl
WER: 52.43 % RTFx: 7.52
processing golesheed/whisper-native-children-0-dutch for subset SK-ADHD-W-S


train-00000-of-00001.parquet:   0%|          | 0.00/37.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/902 [00:00<?, ? examples/s]

Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 881it [00:00, 9923.87it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-0-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-W-S.jsonl
WER: 53.54 % RTFx: 7.52
processing golesheed/whisper-native-children-0-dutch for subset SK-TD-C-S


train-00000-of-00001.parquet:   0%|          | 0.00/28.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/859 [00:00<?, ? examples/s]

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 845it [00:00, 8559.23it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-0-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-C-S.jsonl
WER: 45.97 % RTFx: 6.28
processing golesheed/whisper-native-children-0-dutch for subset SK-TD-W-S


train-00000-of-00001.parquet:   0%|          | 0.00/28.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/859 [00:00<?, ? examples/s]

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 845it [00:00, 9849.23it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-0-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-W-S.jsonl
WER: 47.01 % RTFx: 6.28


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

processing golesheed/whisper-native-children-1-dutch for subset CK-TD-W-S


Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Samples...: 508it [00:00, 9974.98it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-1-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_CK-TD-W-S.jsonl
WER: 69.37 % RTFx: 7.14
processing golesheed/whisper-native-children-1-dutch for subset SK-ADHD-C-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 879it [00:00, 9125.66it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-1-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-C-S.jsonl
WER: 54.79 % RTFx: 5.99
processing golesheed/whisper-native-children-1-dutch for subset SK-ADHD-W-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 879it [00:00, 9684.04it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-1-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-W-S.jsonl
WER: 55.57 % RTFx: 5.99
processing golesheed/whisper-native-children-1-dutch for subset SK-TD-C-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 844it [00:00, 9865.73it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-1-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-C-S.jsonl
WER: 47.72 % RTFx: 6.79
processing golesheed/whisper-native-children-1-dutch for subset SK-TD-W-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 844it [00:00, 9405.47it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-1-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-W-S.jsonl
WER: 48.7 % RTFx: 6.79


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

processing golesheed/whisper-native-children-2-dutch for subset CK-TD-W-S


Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Samples...: 508it [00:00, 9987.51it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-2-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_CK-TD-W-S.jsonl
WER: 69.88 % RTFx: 5.46
processing golesheed/whisper-native-children-2-dutch for subset SK-ADHD-C-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 879it [00:00, 9758.09it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-2-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-C-S.jsonl
WER: 55.64 % RTFx: 6.18
processing golesheed/whisper-native-children-2-dutch for subset SK-ADHD-W-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 879it [00:00, 9938.76it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-2-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-W-S.jsonl
WER: 56.58 % RTFx: 6.18
processing golesheed/whisper-native-children-2-dutch for subset SK-TD-C-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 847it [00:00, 9900.97it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-2-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-C-S.jsonl
WER: 47.97 % RTFx: 6.77
processing golesheed/whisper-native-children-2-dutch for subset SK-TD-W-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 847it [00:00, 9747.66it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-2-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-W-S.jsonl
WER: 49.03 % RTFx: 6.77


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

processing golesheed/whisper-native-children-3-dutch for subset CK-TD-W-S


Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Samples...: 503it [00:00, 9591.19it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-3-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_CK-TD-W-S.jsonl
WER: 70.08 % RTFx: 6.01
processing golesheed/whisper-native-children-3-dutch for subset SK-ADHD-C-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 880it [00:00, 9692.41it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-3-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-C-S.jsonl
WER: 56.16 % RTFx: 6.6
processing golesheed/whisper-native-children-3-dutch for subset SK-ADHD-W-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 880it [00:00, 9678.46it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-3-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-W-S.jsonl
WER: 56.95 % RTFx: 6.6
processing golesheed/whisper-native-children-3-dutch for subset SK-TD-C-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 845it [00:00, 9732.55it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-3-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-C-S.jsonl
WER: 47.81 % RTFx: 7.12
processing golesheed/whisper-native-children-3-dutch for subset SK-TD-W-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 845it [00:00, 8989.15it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-3-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-W-S.jsonl
WER: 48.92 % RTFx: 7.13


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

processing golesheed/whisper-native-children-4-dutch for subset CK-TD-W-S


Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Samples...: 510it [00:00, 9943.08it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-4-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_CK-TD-W-S.jsonl
WER: 70.26 % RTFx: 6.42
processing golesheed/whisper-native-children-4-dutch for subset SK-ADHD-C-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 884it [00:00, 9342.00it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-4-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-C-S.jsonl
WER: 55.15 % RTFx: 7.11
processing golesheed/whisper-native-children-4-dutch for subset SK-ADHD-W-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 884it [00:00, 9699.18it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-4-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-W-S.jsonl
WER: 56.03 % RTFx: 7.1
processing golesheed/whisper-native-children-4-dutch for subset SK-TD-C-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 843it [00:00, 9844.71it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-4-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-C-S.jsonl
WER: 46.65 % RTFx: 6.63
processing golesheed/whisper-native-children-4-dutch for subset SK-TD-W-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 843it [00:00, 9775.63it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-4-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-W-S.jsonl
WER: 47.82 % RTFx: 6.63


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

processing golesheed/whisper-native-children-5-dutch for subset CK-TD-W-S


Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Samples...: 508it [00:00, 9993.84it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-5-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_CK-TD-W-S.jsonl
WER: 68.25 % RTFx: 8.07
processing golesheed/whisper-native-children-5-dutch for subset SK-ADHD-C-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 882it [00:00, 9493.17it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-5-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-C-S.jsonl
WER: 55.52 % RTFx: 8.11
processing golesheed/whisper-native-children-5-dutch for subset SK-ADHD-W-S


Map:   0%|          | 0/902 [00:00<?, ? examples/s]

Samples...: 882it [00:00, 9906.58it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-5-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-ADHD-W-S.jsonl
WER: 56.44 % RTFx: 8.12
processing golesheed/whisper-native-children-5-dutch for subset SK-TD-C-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 847it [00:00, 9849.61it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-5-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-C-S.jsonl
WER: 47.74 % RTFx: 8.3
processing golesheed/whisper-native-children-5-dutch for subset SK-TD-W-S


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Samples...: 847it [00:00, 9461.65it/s]


Results saved at path: /content/results/MODEL_golesheed-whisper-native-children-5-dutch_DATASET_bchiusano-AllAsymmetriesCHILDES_SK-TD-W-S.jsonl
WER: 48.87 % RTFx: 8.3


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

processing golesheed/whisper-native-children-6-dutch for subset CK-TD-W-S


Map:   0%|          | 0/525 [00:00<?, ? examples/s]

KeyboardInterrupt: 

# Second Benchmark Method (Currently not in use)

In [None]:
# Optional compilation step: only runs when the current `args` request it.
# Relies on `args`, `model` and `torch` already being defined by earlier cells.
if args.torch_compile:
    compile_options = {"mode": args.compile_mode, "fullgraph": True}
    model.forward = torch.compile(model.forward, **compile_options)

    supports_generation = model.can_generate()
    if supports_generation:
        # Autoregressive models: switch generation to the static k/v cache.
        model.generation_config.cache_implementation = "static"

In [None]:
def benchmark(batch):
  """Transcribe one mini-batch and record per-sample timing information.

  Relies on the notebook-level globals ``args`` (``batch_size``, ``torch_compile``),
  ``model``, ``processor``, ``normalizer``, ``sdpa_kernel`` and ``SDPBackend``.

  Args:
    batch: Mapping with an ``"audio"`` column (a list of dicts, each holding the raw
      samples under ``"array"``) and a ``"norm_text"`` column with the reference
      transcripts. Audio is assumed to be sampled at 16 kHz, since that rate is
      hard-coded both for the duration and for the processor call.

  Returns:
    The same ``batch`` object with four columns added: ``audio_length_s``,
    ``transcription_time_s``, ``predictions`` and ``references``.
  """
  sampling_rate = 16_000

  # Load audio inputs
  audios = [audio["array"] for audio in batch["audio"]]
  # Durations are computed before any padding copies are appended to `audios`.
  batch["audio_length_s"] = [len(samples) / sampling_rate for samples in audios]
  minibatch_size = len(audios)

  # START TIMING
  # perf_counter() is monotonic, so the measured interval cannot be distorted by
  # wall-clock adjustments the way time.time() can.
  start_time = time.perf_counter()

  # 1. Pre-Processing
  # 1.1 Pad the mini-batch up to the full batch size when the model is compiled, so
  # that the batch dimension stays constant. Only pad when the batch is actually
  # short: a negative padding size would otherwise make the slice below drop real
  # predictions.
  padding_size = None
  if args.torch_compile and minibatch_size < args.batch_size:
    padding_size = args.batch_size - minibatch_size
    audios.extend([audios[-1]] * padding_size)

  if not model.can_generate(): #or len(audios[0]) > processor.feature_extractor.n_samples:
    # 1.2 Either CTC pre-processing (normalize to mean 0, std 1), or long-form Whisper processing
    inputs = processor(
        audios,
        sampling_rate=sampling_rate,
        truncation=False,
        padding="longest",
        return_tensors="pt",
        return_attention_mask=True,
    )
  else:
    # 1.3 Standard Whisper processing: pad audios to 30-seconds and converted to log-mel
    inputs = processor(audios, sampling_rate=sampling_rate, return_tensors="pt", device="cuda")

  inputs = inputs.to("cuda")

  # 2. Model Inference
  with sdpa_kernel(SDPBackend.MATH if args.torch_compile else SDPBackend.FLASH_ATTENTION):
    if model.can_generate():
      # 2.1 Auto-regressive generation for encoder-decoder models
      predicted_ids = model.generate(**inputs, task="transcribe", language="nl")
    else:
      # 2.2. Single forward pass for CTC
      with torch.no_grad():
        logits = model(**inputs).logits
        predicted_ids = logits.argmax(-1)

  # 3. Post-processing
  # 3.1 Strip the predictions that belong to the padding copies
  if padding_size is not None:
    predicted_ids = predicted_ids[:-padding_size, ...]

  # 3.2 Convert token ids to text transcription
  pred_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)

  # END TIMING
  runtime = time.perf_counter() - start_time

  # normalize by minibatch size since we want the per-sample time
  batch["transcription_time_s"] = minibatch_size * [runtime / minibatch_size]

  # Normalize the transcriptions with the notebook's `normalizer`
  # (NOTE(review): confirm it is appropriate for Dutch text).
  batch["predictions"] = [normalizer(pred) for pred in pred_text]
  batch["references"] = batch["norm_text"]
  return batch

In [None]:
import gc

In [None]:
# Evaluation driver for the second benchmark method: for every subset, run
# `benchmark` over the prepared data, drop implausible predictions, write the
# manifest and print WER / RTFx.
all_subsets = ["CK-TD-W-S"]

for subset in all_subsets:

  # NOTE(review): a recorded run of this cell ended in a CUDA out-of-memory error
  # with most memory held in private pools (CUDA graphs). Presumably "max-autotune"
  # captures a new graph for every padded input length; consider
  # "max-autotune-no-cudagraphs" and re-running the compile cell - TODO confirm.
  args = SimpleNamespace(
        model_id="facebook/wav2vec2-large-xlsr-53-dutch",
        dataset_path="bchiusano/AllAsymmetriesCHILDES",
        dataset=subset,
        batch_size=16,
        torch_compile=True,
        compile_mode="max-autotune",
        streaming=False,
    )

  # Load data
  dataset = load_data(args)
  dataset = prepare_data(dataset)

  results = dataset.map(
        benchmark, batch_size=args.batch_size, batched=True, remove_columns=["audio"],
    )

  # Post-processing - delete weird results: keep a row only when its prediction has
  # at most twice as many words as its reference. The two columns are read once
  # and the keep-list is built in a single pass (no per-row set rebuild).
  keep_idx = [
      i
      for i, (reference, prediction) in enumerate(
          zip(results["references"], results["predictions"])
      )
      if len(prediction.split()) <= 2 * len(reference.split())
  ]

  if not keep_idx:
      # Nothing to score: WER and RTFx are undefined (RTFx would divide by zero).
      print("No samples left after filtering for subset", subset, "- skipping metrics")
      continue

  results = results.select(keep_idx)

  all_results = {
      "audio_length_s": [],
      "transcription_time_s": [],
      "predictions": [],
      "references": [],
  }
  # Iterate through a plain iterator so tqdm shows a running count without a total.
  result_iter = iter(results)
  for result in tqdm(result_iter, desc="Samples..."):
      for key in all_results:
          all_results[key].append(result[key])

  # Write manifest results (WER and RTFX)
  manifest_path = write_manifest(
      all_results["references"],
      all_results["predictions"],
      args.model_id,
      args.dataset_path,
      args.dataset,
      audio_length=all_results["audio_length_s"],
      transcription_time=all_results["transcription_time_s"],
  )
  print("Results saved at path:", os.path.abspath(manifest_path))

  wer = wer_metric.compute(
      references=all_results["references"], predictions=all_results["predictions"]
  )
  wer = round(100 * wer, 2)
  # RTFx: seconds of audio transcribed per second of processing time.
  rtfx = round(sum(all_results["audio_length_s"]) / sum(all_results["transcription_time_s"]), 2)
  print("WER:", wer, "%", "RTFx:", rtfx)



README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/525 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/525 [00:00<?, ? examples/s]

W0725 13:53:46.190000 320 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode
AUTOTUNE addmm(22688x1024, 22688x512, 512x1024)
  bias_addmm 2.4668 ms 100.0% 
  addmm 2.9737 ms 83.0% 
SingleProcess AUTOTUNE benchmarking takes 0.3021 seconds and 0.0005 seconds precompiling for 2 choices
AUTOTUNE addmm(22688x50, 22688x1024, 1024x50)
  addmm 0.5868 ms 100.0% 
  bias_addmm 0.6011 ms 97.6% 
SingleProcess AUTOTUNE benchmarking takes 0.2324 seconds and 0.0005 seconds precompiling for 2 choices
AUTOTUNE addmm(4848x1024, 4848x512, 512x1024)
  bias_addmm 0.4516 ms 100.0% 
  addmm 0.5212 ms 86.6% 
SingleProcess AUTOTUNE benchmarking takes 0.2376 seconds and 0.0005 seconds precompiling for 2 choices
AUTOTUNE addmm(4848x50, 4848x1024, 1024x50)
  addmm 0.1505 ms 100.0% 
  bias_addmm 0.1587 ms 94.8% 
SingleProcess AUTOTUNE benchmarking takes 0.2196 seconds and 0.0004 seconds precompiling for 2 choices


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.98 GiB. GPU 0 has a total capacity of 22.16 GiB of which 2.81 GiB is free. Process 3187 has 19.34 GiB memory in use. Of the allocated memory 18.82 GiB is allocated by PyTorch, with 17.58 GiB allocated in private pools (e.g., CUDA Graphs), and 278.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)