# Whisper Evaluation on Child Speech Testing Data

## Setup of the Evaluation Functions

In [None]:
import os, re, unicodedata, string, contextlib, wave
from difflib import get_close_matches
import whisper


# NOTE: the filename label pattern (_LABEL_PAT) is defined once, next to
# _true_from_filename below. An earlier, stricter duplicate used to live here
# but was always overwritten before use, so it has been removed.

# Accepted English transcriptions per label: the number word itself, common
# homophones / mis-hearings, and the digit string.
_EN_NUM_WORDS = {
    0: ["zero", "oh", "o", "0"],
    1: ["one", "1"],
    2: ["two", "too", "to", "2"],
    3: ["three", "tree", "free", "3"],
    4: ["four", "for", "fore", "4"],
    5: ["five", "5"],
    6: ["six", "6"],
    7: ["seven", "7"],
    8: ["eight", "ate", "8"],
    9: ["nine", "9"],
    10: ["ten", "10"],
}
# Accepted Afrikaans transcriptions per label: the target word plus
# English-looking spellings that are accepted as the same word.
_AFR_NUM_WORDS = {
    0:  ["boom", "boem"],
    1:  ["eet", "eat"],
    2:  ["hond"],
    3:  ["huis"],
    4:  ["kwaad"],
    5:  ["lekker"],
    6:  ["nee", "knee"],
    7:  ["seuntjie"],
    8:  ["sien", "seen"],
    9:  ["wors", "worse"],
    10: ["ja", "ya"],
}

# Reverse lookups: transcribed word -> label.
_WORD_TO_EN = {w: n for n, ws in _EN_NUM_WORDS.items() for w in ws}
_WORD_TO_AF = {w: n for n, ws in _AFR_NUM_WORDS.items() for w in ws}
# Digit strings "0".."10" count as correct in both languages. update() is used
# instead of a for-loop so no loop variable is left behind in the module
# namespace.
_WORD_TO_EN.update((str(n), n) for n in range(0, 11))
_WORD_TO_AF.update((str(n), n) for n in range(0, 11))

def _normalize(s: str) -> str:
    """Return a canonical form of *s* for token comparison.

    The text is lower-cased, stripped of accents (NFKD decomposition with the
    combining marks dropped), stripped of ASCII punctuation, and has every run
    of whitespace collapsed to a single space with the ends trimmed.
    """
    decomposed = unicodedata.normalize("NFKD", s.lower())

    # Keep base characters only; the combining marks are the accents.
    base_chars = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            continue
        base_chars.append(ch)

    drop_punctuation = str.maketrans("", "", string.punctuation)
    cleaned = "".join(base_chars).translate(drop_punctuation)

    return re.sub(r"\s+", " ", cleaned).strip()

# Filename label: the literal "NoNum" or a 1-2 digit number, immediately
# followed by an underscore at the very start of the name.
_LABEL_PAT = re.compile(r"^(NoNum|\d{1,2})_")

def _true_from_filename(fname: str):
    """Extract the ground-truth label encoded at the start of *fname*.

    Returns 10 for a "NoNum_" prefix, the integer itself for a numeric prefix
    in 0..10, and None when the name has no recognised prefix or the number
    is outside 0..10.
    """
    found = _LABEL_PAT.match(fname)
    if found is None:
        return None

    token = found.group(1)
    if token == "NoNum":
        return 10

    value = int(token)
    if value < 0 or value > 10:
        return None
    return value


def _pred_from_transcript(txt: str, lang: str = "en", fuzzy: bool = True, cutoff: float = 0.8):
    """Map a transcript to a label in 0..10, or None when nothing matches.

    The transcript is normalized with _normalize, split on spaces, and filler
    words are dropped. The remaining tokens are then resolved in three stages;
    each stage scans the tokens left to right and the first hit wins:

      1. exact lookup in the language's word table,
      2. a bare decimal number whose value is in 0..10 (covers forms such as
         "05" that are not literal keys of the table),
      3. if *fuzzy* is true, the closest table entry according to
         difflib.get_close_matches, for tokens of at least 3 characters.

    Args:
        txt: Raw transcript text.
        lang: "af" selects the Afrikaans table and fillers; any other value
            selects the English ones.
        fuzzy: Enable stage 3.
        cutoff: Minimum similarity passed to get_close_matches.

    Returns:
        The predicted label as an int, or None.
    """
    toks = _normalize(txt).split()

    # Fillers must be written in their normalized form: tokens are compared
    # after _normalize has removed punctuation, so "'n" arrives as "n" and
    # "ja-nee" arrives as "janee".
    if lang == "af":
        vocab_map = _WORD_TO_AF
        fillers = {"dis", "dit", "is", "die", "nommer", "syfer", "n", "janee"}
    else:
        vocab_map = _WORD_TO_EN
        fillers = {"its", "it", "is", "the", "number", "digit", "a", "an"}

    toks = [tok for tok in toks if tok not in fillers]

    # Stage 1: exact vocab match
    for tok in toks:
        if tok in vocab_map:
            return vocab_map[tok]

    # Stage 2: numeric token (0..10).
    # isdecimal() rather than isdigit(): isdigit() is also true for digit-like
    # characters that int() rejects with ValueError, and one odd transcript
    # must not abort the whole evaluation.
    for tok in toks:
        if tok.isdecimal():
            v = int(tok)
            if 0 <= v <= 10:
                return v

    # Stage 3: fuzzy match
    if fuzzy:
        vocab = list(vocab_map)
        for tok in toks:
            # Very short tokens are skipped before any fuzzy comparison.
            if len(tok) < 3:
                continue
            m = get_close_matches(tok, vocab, n=1, cutoff=cutoff)
            if m:
                return vocab_map[m[0]]
    return None

def _duration_sec(path: str):
    """Return the length of the WAV file at *path* in seconds.

    Best effort: returns None when the file cannot be opened or read as a WAV
    file, or when its header reports a frame rate of zero.
    """
    try:
        with contextlib.closing(wave.open(path, 'r')) as reader:
            n_frames = reader.getnframes()
            sample_rate = reader.getframerate()
    except Exception:
        # Unreadable or non-WAV file: the caller treats None as "unknown".
        return None

    if not sample_rate:
        return None
    return n_frames / float(sample_rate)

def accuracy_whisper(
    folder_path: str,
    model_name: str = "small",
    lang: str = "en",                 # "en" or "af"
    min_duration_sec: float = 0.1,
    debug_misses: bool = True,
    debug_limit: int = 20,
):
    """
    Recursively evaluate every WAV file under folder_path with Whisper.

    - The true label comes from the file name via _true_from_filename
      (e.g. '04_00.wav' -> 4, 'NoNum_00.wav' -> 10); files without a usable
      label are skipped and counted as unlabeled.
    - Files whose duration is known and shorter than min_duration_sec are
      skipped and counted as short. Files whose duration cannot be read are
      still evaluated.
    - Each remaining file is transcribed, and the transcript is mapped to a
      label with _pred_from_transcript; a prediction of None counts as a miss.

    Args:
        folder_path: Root directory to search.
        model_name: Name passed to whisper.load_model.
        lang: "af" for Afrikaans; any other value is treated as English.
        min_duration_sec: Minimum clip length, in seconds, to evaluate.
        debug_misses: Print a line for each wrong prediction.
        debug_limit: Maximum number of miss lines to print.

    Returns:
        Accuracy in [0, 1]; 0.0 when no file was evaluated.
    """
    model = whisper.load_model(model_name)

    total, correct = 0, 0
    skipped_unlabeled = 0
    skipped_short = 0
    misses_printed = 0

    # Collect every .wav (case-insensitive extension) below folder_path and
    # report how many were found before the slow transcription loop starts.
    wav_paths = [
        os.path.join(dp, f)
        for dp, _, files in os.walk(folder_path)
        for f in files
        if f.lower().endswith(".wav")
    ]
    print(f"Found WAVs (recursive): {len(wav_paths)}")

    # Language code handed to Whisper
    whisper_lang = "af" if lang == "af" else "en"

    # Sorted so the evaluation order (and the miss log) is reproducible.
    for fullpath in sorted(wav_paths):
        fname = os.path.basename(fullpath)
        true_lab = _true_from_filename(fname)
        if true_lab is None:
            skipped_unlabeled += 1
            continue

        dur = _duration_sec(fullpath)
        if dur is not None and dur < min_duration_sec:
            skipped_short += 1
            continue

        result = model.transcribe(fullpath, fp16=False, language=whisper_lang, task="transcribe")

        # A missing or empty "text" entry is treated as an empty transcript.
        text = (result.get("text") or "").strip()
        pred = _pred_from_transcript(text, lang=lang)

        total += 1
        if pred == true_lab:
            correct += 1
        elif debug_misses and misses_printed < debug_limit:
            # "is not None" (not truthiness) so a known duration of 0.0 is
            # still reported, matching the skip check above.
            if dur is not None:
                print(f"MISS {fname} → {text!r}  pred={pred}  true={true_lab}  dur={dur:.2f}s")
            else:
                print(f"MISS {fname} → {text!r}  pred={pred}  true={true_lab}")
            misses_printed += 1

    print(f"\nEvaluated: {total}  | Skipped (unlabeled): {skipped_unlabeled}  | Skipped (short<{min_duration_sec}s): {skipped_short}")
    return (correct / total) if total > 0 else 0.0

# Language-specific convenience wrappers around accuracy_whisper
def accuracy_whisper_en(folder_path, model_name="small", min_duration_sec=0.12, debug_misses=True, debug_limit=20):
    """Run accuracy_whisper with lang="en".

    All other arguments are forwarded unchanged; note that the default
    min_duration_sec here is 0.12.
    """
    options = dict(
        model_name=model_name,
        lang="en",
        min_duration_sec=min_duration_sec,
        debug_misses=debug_misses,
        debug_limit=debug_limit,
    )
    return accuracy_whisper(folder_path, **options)

def accuracy_whisper_af(folder_path, model_name="small", min_duration_sec=0.12, debug_misses=True, debug_limit=20):
    """Run accuracy_whisper with lang="af".

    All other arguments are forwarded unchanged; note that the default
    min_duration_sec here is 0.12.
    """
    options = dict(
        model_name=model_name,
        lang="af",
        min_duration_sec=min_duration_sec,
        debug_misses=debug_misses,
        debug_limit=debug_limit,
    )
    return accuracy_whisper(folder_path, **options)


## Evaluating the Functions

### English Evaluation

In [18]:
# Evaluate the "large" Whisper model on the English recordings. With the
# wrapper's defaults, up to 20 wrong predictions are printed as they occur.
acc = accuracy_whisper_en("TestingData/OnlyNumbers/English/", model_name="large")

# acc is a fraction in [0, 1]; the .2% format prints it as a percentage.
print(f"\nWhisper accuracy on English Child Speech: {acc:.2%}")

Found WAVs (recursive): 102


Python(35286) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35330) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35353) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_00.wav → 'Thank you.'  pred=None  true=2  dur=0.83s


Python(35372) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 03_00.wav → 'Do it.'  pred=None  true=3  dur=1.07s


Python(35379) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35383) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 05_00.wav → 'V.'  pred=None  true=5  dur=0.79s


Python(35386) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 06_00.wav → 'Sex.'  pred=None  true=6  dur=1.26s


Python(35388) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 07_00.wav → 'I love it.'  pred=None  true=7  dur=0.82s


Python(35390) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35391) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 09_00.wav → 'Not.'  pred=None  true=9  dur=1.41s


Python(35392) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 01_00.wav → 'corn'  pred=None  true=1  dur=0.93s


Python(35393) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35394) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35395) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 04_00.wav → ''  pred=None  true=4  dur=0.53s


Python(35396) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 05_00.wav → 'Bye.'  pred=None  true=5  dur=0.68s


Python(35397) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 06_00.wav → 'Thank you.'  pred=None  true=6  dur=0.84s


Python(35400) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 07_00.wav → 'That way.'  pred=None  true=7  dur=0.68s


Python(35410) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 08_00.wav → ''  pred=None  true=8  dur=0.65s


Python(35411) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 09_00.wav → 'Normally.'  pred=None  true=9  dur=1.19s


Python(35412) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 09_01.wav → 'Right.'  pred=8  true=9  dur=0.59s


Python(35425) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35426) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35427) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35428) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35429) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 04_00.wav → '54.'  pred=None  true=4  dur=1.45s


Python(35430) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35431) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35434) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 07_01.wav → '생朋友'  pred=None  true=7  dur=1.57s


Python(35437) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 09_00.wav → 'Bye.'  pred=None  true=9  dur=1.00s


Python(35438) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 00_00.wav → 'You.'  pred=None  true=0  dur=0.62s


Python(35439) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35440) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_00.wav → ''  pred=None  true=2  dur=0.61s


Python(35441) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 03_00.wav → ''  pred=None  true=3  dur=0.57s


Python(35442) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35443) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35444) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35446) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35450) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35451) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35452) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35453) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35454) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35455) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(35456) Malloc


Evaluated: 102  | Skipped (unlabeled): 0  | Skipped (short<0.12s): 0

Whisper accuracy on English Child Speech: 56.86%


### Afrikaans Evaluation

In [13]:
# Evaluate the "large" Whisper model on the Afrikaans recordings. With the
# wrapper's defaults, up to 20 wrong predictions are printed as they occur.
acc = accuracy_whisper_af("TestingData/OnlyNumbers/Afrikaans/", model_name="large")

# acc is a fraction in [0, 1]; the .2% format prints it as a percentage.
print(f"\nWhisper accuracy on Afrikaans Child Speech: {acc:.2%}")

Found WAVs (recursive): 95


Python(29455) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29497) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 00_01.wav → 'boel.'  pred=None  true=0  dur=0.42s


Python(29524) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29573) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 00_03.wav → 'Om.'  pred=None  true=0  dur=0.35s


Python(29593) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 00_04.wav → 'Poem!'  pred=None  true=0  dur=0.54s


Python(29607) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29617) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29622) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29623) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29629) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 00_09.wav → 'Poem.'  pred=None  true=0  dur=0.34s


Python(29681) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 01_00.wav → ''  pred=None  true=1  dur=0.29s


Python(29723) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29737) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 01_02.wav → ''  pred=None  true=1  dur=0.27s


Python(29738) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 01_03.wav → 'Ek.'  pred=None  true=1  dur=0.26s


Python(29744) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 01_04.wav → 'ek...'  pred=None  true=1  dur=0.30s


Python(29749) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 01_05.wav → 'Ondertitels ingediend door die Amara.org gemeenschap'  pred=None  true=1  dur=0.20s


Python(29766) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29772) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29776) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 01_08.wav → 'Ondertitels ingediend door die Amara.org gemeenschap'  pred=None  true=1  dur=0.22s


Python(29787) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29793) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_00.wav → 'want'  pred=None  true=2  dur=0.57s


Python(29800) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_01.wav → 'Hoor dit.'  pred=None  true=2  dur=0.64s


Python(29846) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29873) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_03.wav → 'Onds.'  pred=None  true=2  dur=0.47s


Python(29885) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_04.wav → 'Ondertitels ingediend door die Amara.org gemeenschap'  pred=None  true=2  dur=0.33s


Python(29952) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_05.wav → 'Bon...'  pred=None  true=2  dur=0.58s


Python(29958) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_06.wav → ''  pred=None  true=2  dur=0.41s


Python(29961) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_07.wav → 'on.'  pred=None  true=2  dur=0.46s


Python(29962) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_08.wav → ''  pred=None  true=2  dur=0.37s


Python(29963) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 02_09.wav → 'one.'  pred=None  true=2  dur=0.40s


Python(29967) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MISS 03_00.wav → ''  pred=None  true=3  dur=0.64s


Python(29976) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29980) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29982) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(29991) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(30007) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(30127) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(30130) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(30132) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(30133) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(30138) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(30142) Malloc


Evaluated: 95  | Skipped (unlabeled): 0  | Skipped (short<0.12s): 0

Whisper accuracy on Afrikaans Child Speech: 22.11%
