# Whisper Evaluation on Child Speech Testing Data

## Setup of the Evaluation Functions

In [1]:
import os, re, unicodedata, string, contextlib, wave
from difflib import get_close_matches
import whisper

# Initial filename-label pattern: one or two leading digits, then an underscore.
# NOTE: this name is rebound further down the file to a pattern that also accepts "NoNum".
_LABEL_PAT = re.compile(r"^(\d{1,2})_")

# Accepted English transcriptions per label: the number word, similar-sounding
# words, and the digit string itself.
_EN_NUM_WORDS = {
    0: ["zero", "oh", "o", "0"],
    1: ["one", "1"],
    2: ["two", "too", "to", "2"],
    3: ["three", "tree", "free", "3"],
    4: ["four", "for", "fore", "4"],
    5: ["five", "5"],
    6: ["six", "6"],
    7: ["seven", "7"],
    8: ["eight", "ate", "8"],
    9: ["nine", "9"],
    10: ["ten", "10"],
}

# Accepted Afrikaans transcriptions per label: the target word plus
# similar-sounding spellings.
_AFR_NUM_WORDS = {
    0: ["boom", "boem"],
    1: ["eet", "eat"],
    2: ["hond"],
    3: ["huis"],
    4: ["kwaad"],
    5: ["lekker"],
    6: ["nee", "knee"],
    7: ["seuntjie"],
    8: ["sien", "seen"],
    9: ["wors", "worse"],
    10: ["ja", "ya"],
}

# Reverse lookups: accepted token -> label
_WORD_TO_EN = dict(
    (word, label) for label, words in _EN_NUM_WORDS.items() for word in words
)
_WORD_TO_AF = dict(
    (word, label) for label, words in _AFR_NUM_WORDS.items() for word in words
)

# A bare digit string always counts as its own label, in both languages
for _n in range(11):
    _WORD_TO_EN[str(_n)] = _WORD_TO_AF[str(_n)] = _n

def _normalize(s: str) -> str:
    """Canonicalise text for token matching.

    Lower-cases the input, removes accents (NFKD decomposition followed by
    dropping combining marks), deletes every ASCII punctuation character
    without inserting a space in its place, and collapses whitespace runs to a
    single space with no leading or trailing blanks.
    """
    decomposed = unicodedata.normalize("NFKD", s.lower())
    # combining() is non-zero for the accent marks NFKD splits off
    base_chars = [c for c in decomposed if unicodedata.combining(c) == 0]
    # Mapping an ordinal to None makes translate() delete that character
    drop_punct = dict.fromkeys(map(ord, string.punctuation))
    stripped = "".join(base_chars).translate(drop_punct)
    return re.sub(r"\s+", " ", stripped).strip()

# Filename label: the literal "NoNum" or one/two digits, ended by an underscore
_LABEL_PAT = re.compile(r"^(NoNum|\d{1,2})_")


def _true_from_filename(fname: str):
    """Read the ground-truth label encoded at the start of a file name.

    Returns 10 for a "NoNum_" prefix, the integer value for a numeric prefix
    in 0..10, and None when the name has no recognised prefix or the number
    is above 10.
    """
    found = _LABEL_PAT.match(fname)
    if found is None:
        return None

    prefix = found.group(1)
    if prefix == "NoNum":
        return 10

    value = int(prefix)
    if 0 <= value <= 10:
        return value
    return None


def _pred_from_transcript(txt: str, lang: str = "en", fuzzy: bool = True, cutoff: float = 0.8):
    """Turn a raw transcript into a label in 0..10, or None when nothing matches.

    The transcript is normalized, filler words are dropped, and the remaining
    tokens are tried in three passes. Each pass scans the tokens left to right
    and returns on its first hit:

    1. exact lookup in the selected word -> label table,
    2. a decimal number whose value lies in 0..10 (e.g. "07"),
    3. optionally, the closest vocabulary word found by difflib.

    Args:
        txt: Raw transcript text.
        lang: "af" selects the Afrikaans vocabulary; any other value uses English.
        fuzzy: Enable the approximate-match pass.
        cutoff: Minimum similarity passed to ``get_close_matches``.

    Returns:
        The matched label as an int, or None if no pass produced a match.
    """
    if lang == "af":
        vocab_map = _WORD_TO_AF
        # Fillers are compared with normalized tokens, so they are written in
        # normalized form: _normalize strips punctuation, which turns "'n"
        # into "n" and "ja-nee" into "janee".
        fillers = {"dis", "dit", "is", "die", "nommer", "syfer", "n", "janee"}
    else:
        vocab_map = _WORD_TO_EN
        fillers = {"its", "it", "is", "the", "number", "digit", "a", "an"}

    toks = [tok for tok in _normalize(txt).split() if tok not in fillers]

    # exact vocab match
    for tok in toks:
        if tok in vocab_map:
            return vocab_map[tok]

    # numeric token (0..10)
    for tok in toks:
        # isdecimal() rather than isdigit(): isdigit() is also true for digit
        # characters that int() refuses, which would raise ValueError here.
        if tok.isdecimal():
            v = int(tok)
            if 0 <= v <= 10:
                return v

    # fuzzy match
    if fuzzy:
        vocab = list(vocab_map)
        for tok in toks:
            # Tokens shorter than three characters are never fuzzy-matched
            if len(tok) < 3:
                continue
            m = get_close_matches(tok, vocab, n=1, cutoff=cutoff)
            if m:
                return vocab_map[m[0]]
    return None

def _duration_sec(path: str):
    """Return the length of a WAV file in seconds.

    Best-effort: any failure while opening or reading the file yields None
    instead of an exception, as does a sample rate of zero.
    """
    try:
        reader = wave.open(path, 'r')
        with contextlib.closing(reader):
            sample_rate = reader.getframerate()
            n_frames = reader.getnframes()
    except Exception:
        return None

    if not sample_rate:
        return None
    return n_frames / float(sample_rate)

def accuracy_whisper(
    folder_path: str,
    model_name: str = "small",
    lang: str = "en",                 # "en" or "af"
    min_duration_sec: float = 0.1,
    debug_misses: bool = True,
    debug_limit: int = 20,
):
    """
    Score a Whisper model on every labelled WAV found beneath folder_path.

    - The directory tree is walked recursively; files ending in ".wav"
      (case-insensitive) are processed in sorted full-path order.
    - The true label is read from the file name by _true_from_filename;
      files without a usable label are counted and skipped.
    - Files whose measured duration is below min_duration_sec are counted and
      skipped. Files whose duration cannot be read are still evaluated.
    - lang == "af" transcribes and scores as Afrikaans; any other value is
      treated as English.
    - When debug_misses is true, at most debug_limit wrong predictions are
      printed.
    - Prints a summary line and returns correct / evaluated in [0, 1], or 0.0
      when no file was evaluated.
    """
    model = whisper.load_model(model_name)

    # Gather all candidate paths up front so the total can be reported first
    wav_paths = [
        os.path.join(dirpath, name)
        for dirpath, _dirs, names in os.walk(folder_path)
        for name in names
        if name.lower().endswith(".wav")
    ]
    print(f"Found WAVs (recursive): {len(wav_paths)}")

    # Language code handed to Whisper
    whisper_lang = "af" if lang == "af" else "en"

    n_scored = n_hits = 0
    n_unlabeled = n_short = 0
    n_reported = 0

    for wav_path in sorted(wav_paths):
        fname = os.path.basename(wav_path)

        expected = _true_from_filename(fname)
        if expected is None:
            n_unlabeled += 1
            continue

        dur = _duration_sec(wav_path)
        if dur is not None and dur < min_duration_sec:
            n_short += 1
            continue

        output = model.transcribe(wav_path, fp16=False, language=whisper_lang, task="transcribe")
        text = (output.get("text") or "").strip()
        pred = _pred_from_transcript(text, lang=lang)

        n_scored += 1
        if pred == expected:
            n_hits += 1
            continue

        # Wrong prediction: report it unless reporting is off or the cap is reached
        if not debug_misses or n_reported >= debug_limit:
            continue
        report = f"MISS {fname} → {text!r}  pred={pred}  true={expected}"
        if dur:
            report += f"  dur={dur:.2f}s"
        print(report)
        n_reported += 1

    print(f"\nEvaluated: {n_scored}  | Skipped (unlabeled): {n_unlabeled}  | Skipped (short<{min_duration_sec}s): {n_short}")
    if n_scored > 0:
        return n_hits / n_scored
    return 0.0

# Wrappers for English and Afrikaans
def accuracy_whisper_en(folder_path, model_name="small", min_duration_sec=0.12, debug_misses=True, debug_limit=20):
    """English shortcut: call accuracy_whisper with lang="en" and return its accuracy."""
    settings = {
        "model_name": model_name,
        "lang": "en",
        "min_duration_sec": min_duration_sec,
        "debug_misses": debug_misses,
        "debug_limit": debug_limit,
    }
    return accuracy_whisper(folder_path, **settings)

def accuracy_whisper_af(folder_path, model_name="small", min_duration_sec=0.12, debug_misses=True, debug_limit=20):
    """Afrikaans shortcut: call accuracy_whisper with lang="af" and return its accuracy."""
    settings = {
        "model_name": model_name,
        "lang": "af",
        "min_duration_sec": min_duration_sec,
        "debug_misses": debug_misses,
        "debug_limit": debug_limit,
    }
    return accuracy_whisper(folder_path, **settings)


## Evaluating the Functions

### English Evaluation

In [5]:
# Run the "large" Whisper model over the English test folder; the result is the
# fraction of evaluated files whose prediction matched the file-name label.
acc = accuracy_whisper_en("TestingData/OnlyNumbers/English/", model_name="large")

# Show the accuracy as a percentage with two decimals
print(f"\nWhisper accuracy on English Child Speech: {acc:.2%}")

Found WAVs (recursive): 102
MISS 02_00.wav → 'Thank you.'  pred=None  true=2  dur=0.83s
MISS 03_00.wav → 'Do it.'  pred=None  true=3  dur=1.07s
MISS 06_00.wav → 'Sex.'  pred=None  true=6  dur=1.26s
MISS 07_00.wav → 'I love it.'  pred=None  true=7  dur=0.82s
MISS 09_00.wav → 'Not.'  pred=None  true=9  dur=1.41s
MISS 01_00.wav → 'corn'  pred=None  true=1  dur=0.93s
MISS 04_00.wav → ''  pred=None  true=4  dur=0.53s
MISS 05_00.wav → 'Bye.'  pred=None  true=5  dur=0.68s
MISS 06_00.wav → 'Thank you.'  pred=None  true=6  dur=0.84s
MISS 07_00.wav → "We're done with it."  pred=1  true=7  dur=0.68s
MISS 08_00.wav → ''  pred=None  true=8  dur=0.65s
MISS 09_00.wav → 'Normally.'  pred=None  true=9  dur=1.19s
MISS 09_01.wav → 'Bye.'  pred=None  true=9  dur=0.59s
MISS 07_01.wav → 'SECONDS!'  pred=None  true=7  dur=1.57s
MISS 09_00.wav → 'Bye.'  pred=None  true=9  dur=1.00s
MISS 00_00.wav → 'you'  pred=None  true=0  dur=0.62s
MISS 02_00.wav → ''  pred=None  true=2  dur=0.61s
MISS 03_00.wav → ''  pred=

### Afrikaans Evaluation

In [None]:
# Run the "large" Whisper model over the Afrikaans test folder; the result is the
# fraction of evaluated files whose prediction matched the file-name label.
acc = accuracy_whisper_af("TestingData/OnlyNumbers/Afrikaans/", model_name="large")

# Show the accuracy as a percentage with two decimals
print(f"\nWhisper accuracy on Afrikaans Child Speech: {acc:.2%}")

Found WAVs (recursive): 95
MISS 00_01.wav → 'boel.'  pred=None  true=0  dur=0.42s
MISS 00_03.wav → 'Om.'  pred=None  true=0  dur=0.35s
MISS 00_04.wav → 'Poem!'  pred=None  true=0  dur=0.54s
MISS 00_09.wav → 'Poem.'  pred=None  true=0  dur=0.34s
MISS 01_00.wav → ''  pred=None  true=1  dur=0.29s
MISS 01_02.wav → ''  pred=None  true=1  dur=0.27s
MISS 01_03.wav → 'Ek.'  pred=None  true=1  dur=0.26s
MISS 01_04.wav → 'ek...'  pred=None  true=1  dur=0.30s
MISS 01_05.wav → 'Ondertitels ingediend door die Amara.org gemeenschap'  pred=None  true=1  dur=0.20s
MISS 01_08.wav → 'Ondertitels ingediend door die Amara.org gemeenschap'  pred=None  true=1  dur=0.22s
MISS 02_00.wav → 'want'  pred=None  true=2  dur=0.57s
MISS 02_01.wav → 'Hoor dit.'  pred=None  true=2  dur=0.64s
MISS 02_03.wav → 'Onds.'  pred=None  true=2  dur=0.47s
MISS 02_04.wav → 'Ondertitels ingediend door die Amara.org gemeenschap'  pred=None  true=2  dur=0.33s
MISS 02_05.wav → 'Bon...'  pred=None  true=2  dur=0.58s
MISS 02_06.wav → 