# Setup

In [1]:
import os
import re

# Helper functions
from utils.segment_audio import segment_audio_file
from utils.convert_m4a_to_wav import convert_m4a_to_wav
from utils.transcribe import transcribe_segments
from utils.check_mono import check_and_convert_to_mono
from utils.preprocess_gold_transcripts import process_gold_transcripts
from utils.wer_evaluator import *
from utils.counters import *
from utils.postprocess_transcripts import TranscriptCleaner
from utils.read_mozilla_dataset import create_orthographic_files,copy_audio_from_transcripts


# Load WER metric
from evaluate import load

# Load the API key kept in .secrets/hf_api_key.txt, with surrounding whitespace removed
hf_key_path = os.path.join('.secrets', 'hf_api_key.txt')
with open(hf_key_path, 'r') as key_file:
    AUTH_TOKEN = key_file.read().strip()

# Model identifiers (org/name form) to evaluate. Despite the original "Whisper" label,
# the list also contains wav2vec2 and Voxtral entries.
# Order matters: the transcription cells slice MODELS[-1:], so only the LAST entry is
# transcribed there. The WER cells look results up by the part after the final '/'.
MODELS = [
    "golesheed/whisper-native-elderly-9-dutch",
    "golesheed/whisper-9-dutch",
    "golesheed/wav2vec2-xls-r-1b-dutch-3",
    "golesheed/wav2vec2-xls-r-1b-dutch",
    "openai/whisper-large-v3",
    "openai/whisper-large-v2",
    "openai/whisper-small",
    "openai/whisper-medium",
    "mistralai/Voxtral-Mini-3B-2507",
    "openai/whisper-large-v3-turbo",
]

  from .autonotebook import tqdm as notebook_tqdm


# Reformat fragments

In [None]:
# Correct filenames and formats: rename every raw recording to
# interaction_R<number><ext> and hand the new name to convert_m4a_to_wav.
# NOTE(review): RAW_DATA_DIR is not assigned in any visible cell — define it before running.
for filename in sorted(os.listdir(RAW_DATA_DIR)):
    # Skip macOS Finder metadata
    if filename == '.DS_Store':
        continue
    stem, extension = os.path.splitext(filename)
    # Look for the number in the stem only; searching the full filename would pick up
    # the digit inside extensions such as ".m4a" / ".mp3" for files without a number.
    match = re.search(r'(\d+)', stem)
    if not match:
        continue
    new_filename = f"interaction_R{match.group(1)}{extension}"
    old_path = os.path.join(RAW_DATA_DIR, filename)
    new_path = os.path.join(RAW_DATA_DIR, new_filename)
    # os.rename can silently replace an existing file; refuse to clobber a different
    # recording. samefile() keeps re-runs and case-only renames working.
    if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
        raise FileExistsError(
            f"Cannot rename {filename!r} to {new_filename!r}: target already exists"
        )
    os.rename(old_path, new_path)
    convert_m4a_to_wav(new_filename)

# Segment fragments

In [None]:
# Obtain short segments of audio for each speaker from the recordings in CONVERTED_DATA_DIR,
# passing OUTPUT_DIR to segment_audio_file as the destination.
# Do some manual cleanup afterwards and remove segments of Welzijn.AI bot or experimenter.
# NOTE(review): CONVERTED_DATA_DIR and OUTPUT_DIR are not assigned in any visible cell —
# define them before running.
for filename in os.listdir(CONVERTED_DATA_DIR):
    # Skip macOS Finder metadata, consistent with the filename-correction cell
    if filename == '.DS_Store':
        continue
    file_path = os.path.join(CONVERTED_DATA_DIR, filename)
    segment_audio_file(file_path, OUTPUT_DIR)


# Convert fragments to mono

In [None]:
# Run the mono check/conversion helper on the "segments" folder under OUTPUT_DIR
mono_input_dir = os.path.join(OUTPUT_DIR, "segments")
check_and_convert_to_mono(mono_input_dir)

# Transcribe fragments

In [2]:
# Transcribe the Beatrix segments with only the last entry of MODELS.
# Caveat from the original author: check this — folders have been updated since,
# so it may not work as expected.
beatrix_segments_dir = os.path.join('output', 'segments_beatrix')
beatrix_transcripts_dir = os.path.join('output', 'transcripts_beatrix')
transcribe_segments(
    MODELS[-1:],
    cpu=False,
    segments_dir=beatrix_segments_dir,
    output_transcript_dir=beatrix_transcripts_dir,
)


Device set to use mps
`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.


Transcribed interaction_R8_SPEAKER_01_seg_14.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R10_SPEAKER_01_seg_19.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R3_SPEAKER_00_seg_5.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R2_SPEAKER_01_seg_10.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R4_SPEAKER_01_seg_63.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R8_SPEAKER_01_seg_5.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R4_SPEAKER_01_seg_89.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R4_SPEAKER_01_seg_62.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R3_SPEAKER_00_seg_14.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R3_SPEAKER_00_seg_4.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R10_SPEAKER_01_seg_18.wav using openai/whisper-large-v3-turbo
Transcribed interaction_R8_SPEAKER_01_seg_17.wav using openai/whis

# Pre- and postprocess gold standard and output transcriptions

In [3]:
# Build the GOLD/REFERENCE transcript variants needed by the different WER evaluations
beatrix_reference_dir = os.path.join('data', 'reference_transcripts_beatrix')
process_gold_transcripts(reference_folder=beatrix_reference_dir)

Processed files from: data/reference_transcripts_beatrix/orthographic
  Orthographic_clean: data/reference_transcripts_beatrix/orthographic_clean
  Normalized: data/reference_transcripts_beatrix/normalized
  Normalized_clean: data/reference_transcripts_beatrix/normalized_clean


In [5]:
# Produce a single cleaned copy of the OUTPUT transcriptions: no fillers, punctuation,
# capitals, or leading/trailing whitespace
beatrix_raw_transcripts = os.path.join("output", "transcripts_beatrix")
beatrix_cleaned_transcripts = os.path.join("output", "transcripts_beatrix_cleaned")
cleaner = TranscriptCleaner()
cleaner.process_directory(beatrix_raw_transcripts, beatrix_cleaned_transcripts)

Removing existing output directory: output/transcripts_beatrix_cleaned
Found 1990 transcript files to process
Reading from: output/transcripts_beatrix
Writing cleaned files to: output/transcripts_beatrix_cleaned
Processed 100 files...
Processed 200 files...
Processed 300 files...
Processed 400 files...
Processed 500 files...
Processed 600 files...
Processed 700 files...
Processed 800 files...
Processed 900 files...
Processed 1000 files...
Processed 1100 files...
Processed 1200 files...
Processed 1300 files...
Processed 1400 files...
Processed 1500 files...
Processed 1600 files...
Processed 1700 files...
Processed 1800 files...
Processed 1900 files...
Completed processing 1990 files
Cleaned transcripts saved to: output/transcripts_beatrix_cleaned
Original transcripts remain unchanged.


# Compute WER

## Orthographic

In [2]:
# Load WER metric
wer_metric = load("wer")

# First the hardest gold/reference transcripts, 'orthographic'
ref_orth = read_reference_transcripts(os.path.join('data', 'reference_transcripts_beatrix', 'orthographic'))

# Read the raw ASR output once; it does not change between models, so there is no
# need to re-read the directory on every loop iteration.
asr_beatrix_raw = read_asr_transcripts(os.path.join("output", "transcripts_beatrix"))

for model_id in MODELS:
    # Predictions are looked up by the model name without its organisation prefix
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(references=ref_orth, predictions=asr_beatrix_raw[model_name])
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.41
WER for whisper-9-dutch: 0.44
WER for wav2vec2-xls-r-1b-dutch-3: 0.49
WER for wav2vec2-xls-r-1b-dutch: 0.58
WER for whisper-large-v3: 0.12
WER for whisper-large-v2: 0.19
WER for whisper-small: 0.26
WER for whisper-medium: 0.19
WER for Voxtral-Mini-3B-2507: 0.18


## Normalized

In [3]:
# Load reference transcripts with punctuation and capitals removed.
# This seems most fitting for the Dutch Whisper model fine-tuned for older individuals, as it
# does not predict punctuation and capitals, but does predict fillers.
ref_norm = read_reference_transcripts(os.path.join('data', 'reference_transcripts_beatrix', 'normalized'))

# Read the raw ASR output once; it does not change between models, so there is no
# need to re-read the directory on every loop iteration.
asr_beatrix_raw = read_asr_transcripts(os.path.join("output", "transcripts_beatrix"))

for model_id in MODELS:
    # Predictions are looked up by the model name without its organisation prefix
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(references=ref_norm, predictions=asr_beatrix_raw[model_name])
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.20
WER for whisper-9-dutch: 0.25
WER for wav2vec2-xls-r-1b-dutch-3: 0.43
WER for wav2vec2-xls-r-1b-dutch: 0.54
WER for whisper-large-v3: 0.34
WER for whisper-large-v2: 0.37
WER for whisper-small: 0.41
WER for whisper-medium: 0.37
WER for Voxtral-Mini-3B-2507: 0.37


## Normalized_clean

In [4]:
# Load the 'normalized_clean' reference transcripts and score them against the CLEANED
# ASR output (transcripts_beatrix_cleaned: fillers, punctuation, capitals and
# leading/trailing whitespace removed by TranscriptCleaner).
# NOTE(review): presumably the normalized_clean references also have fillers removed —
# confirm in process_gold_transcripts.
ref_norm_clean = read_reference_transcripts(os.path.join('data', 'reference_transcripts_beatrix', 'normalized_clean'))

# Read the cleaned ASR output once; it does not change between models, so there is no
# need to re-read the directory on every loop iteration.
asr_beatrix_cleaned = read_asr_transcripts(os.path.join("output", "transcripts_beatrix_cleaned"))

for model_id in MODELS:
    # Predictions are looked up by the model name without its organisation prefix
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(references=ref_norm_clean, predictions=asr_beatrix_cleaned[model_name])
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.16
WER for whisper-9-dutch: 0.20
WER for wav2vec2-xls-r-1b-dutch-3: 0.36
WER for wav2vec2-xls-r-1b-dutch: 0.46
WER for whisper-large-v3: 0.06
WER for whisper-large-v2: 0.10
WER for whisper-small: 0.17
WER for whisper-medium: 0.11
WER for Voxtral-Mini-3B-2507: 0.10


## Simple stats

In [3]:
# Dataset size: word counts of two reference variants and total segment audio length
beatrix_ref_root = os.path.join('data', 'reference_transcripts_beatrix')

words_orthographic = count_total_words(os.path.join(beatrix_ref_root, 'orthographic'))
print(f"Total words Orthographic: {words_orthographic}")

words_normalized_clean = count_total_words(os.path.join(beatrix_ref_root, 'normalized_clean'))
print(f"Total words Normalized_clean: {words_normalized_clean}")

# get_total_audio_duration's result is divided by 60 to report minutes
duration_minutes = get_total_audio_duration(os.path.join('output', 'segments_beatrix')) / 60
print(f"Total minutes duration segments: {duration_minutes:.2f}")

Total words Orthographic: 2413
Total words Normalized_clean: 1949
Total minutes duration segments: 11.15


In [4]:
# Scratch arithmetic: 11.15 matches the "Total minutes duration segments" value printed above;
# it is converted to seconds and divided by 200.
# NOTE(review): the meaning of 200 is not shown in this notebook (presumably a segment count) — confirm.
(11.15*60)/200

3.345

In [7]:
# Scratch arithmetic: the 11.15 total minutes printed above, divided by 200.
# NOTE(review): the meaning of 200 is not shown in this notebook (presumably a segment count) — confirm.
11.15/200

0.05575

In [8]:
# Scratch arithmetic: 0.05 multiplied by 60.
# NOTE(review): 0.05 looks like a rounded minutes value being converted to seconds — confirm.
60*0.05

3.0

# Quick experiment with Mozilla CV dataset

## Loading/preprocessing

In [7]:
# Preprocess/load Mozilla CV.
# n_rows=50 and seed=42 are passed straight to create_orthographic_files
# (presumably a seeded 50-row selection — confirm in utils.read_mozilla_dataset).
mozilla_validated_tsv = os.path.join('data', 'mozilla_cv', 'validated.tsv')
create_orthographic_files(mozilla_validated_tsv, n_rows=50, seed=42)

mozilla_orthographic_dir = os.path.join('data', 'reference_transcripts_mozilla', 'orthographic')
mozilla_clips_dir = os.path.join('data','mozilla_cv','nl','clips')
copy_audio_from_transcripts(mozilla_orthographic_dir, mozilla_clips_dir)

Found 50 transcript files
Copied: common_voice_nl_41445352.mp3
Copied: common_voice_nl_41445420.mp3
Copied: common_voice_nl_26992952.mp3
Copied: common_voice_nl_27120532.mp3
Copied: common_voice_nl_20540645.mp3
Copied: common_voice_nl_20541943.mp3
Copied: common_voice_nl_28388257.mp3
Copied: common_voice_nl_37256311.mp3
Copied: common_voice_nl_39641321.mp3
Copied: common_voice_nl_18322855.mp3
Copied: common_voice_nl_28388331.mp3
Copied: common_voice_nl_30520119.mp3
Copied: common_voice_nl_41445418.mp3
Copied: common_voice_nl_37249192.mp3
Copied: common_voice_nl_41445906.mp3
Copied: common_voice_nl_41445245.mp3
Copied: common_voice_nl_18702069.mp3
Copied: common_voice_nl_41445509.mp3
Copied: common_voice_nl_41445323.mp3
Copied: common_voice_nl_30505924.mp3
Copied: common_voice_nl_20540623.mp3
Copied: common_voice_nl_41445491.mp3
Copied: common_voice_nl_28795644.mp3
Copied: common_voice_nl_28795728.mp3
Copied: common_voice_nl_41445890.mp3
Copied: common_voice_nl_27149855.mp3
Copied: comm

## Transcribing

In [8]:
# Transcribe the Mozilla CV clips with only the last entry of MODELS
mozilla_segments_dir = os.path.join('output', 'segments_mozilla')
mozilla_transcripts_dir = os.path.join('output', 'transcripts_mozilla')
transcribe_segments(
    MODELS[-1:],
    cpu=False,
    segments_dir=mozilla_segments_dir,
    output_transcript_dir=mozilla_transcripts_dir,
)

Device set to use mps


Transcribed common_voice_nl_30544154.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_19100076.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_37256236.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_18702063.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_28388404.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_27149855.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_28795648.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_28795728.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_41445890.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_41445894.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_37256231.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_28795664.mp3 using openai/whisper-large-v3-turbo
Transcribed common_voice_nl_18695047.mp3 using openai/whisper-large-v3-turbo

## Postprocessing

In [9]:
# Build the GOLD/REFERENCE transcript variants needed by the different WER evaluations
mozilla_reference_dir = os.path.join('data', 'reference_transcripts_mozilla')
process_gold_transcripts(mozilla_reference_dir)

Processed files from: data/reference_transcripts_mozilla/orthographic
  Orthographic_clean: data/reference_transcripts_mozilla/orthographic_clean
  Normalized: data/reference_transcripts_mozilla/normalized
  Normalized_clean: data/reference_transcripts_mozilla/normalized_clean


In [10]:
# Produce a single cleaned copy of the OUTPUT transcriptions: no fillers, punctuation,
# capitals, or leading/trailing whitespace
mozilla_raw_transcripts = os.path.join("output", "transcripts_mozilla")
mozilla_cleaned_transcripts = os.path.join("output", "transcripts_mozilla_cleaned")
cleaner = TranscriptCleaner()
cleaner.process_directory(mozilla_raw_transcripts, mozilla_cleaned_transcripts)

Removing existing output directory: output/transcripts_mozilla_cleaned
Found 500 transcript files to process
Reading from: output/transcripts_mozilla
Writing cleaned files to: output/transcripts_mozilla_cleaned
Processed 100 files...
Processed 200 files...
Processed 300 files...
Processed 400 files...
Processed 500 files...
Completed processing 500 files
Cleaned transcripts saved to: output/transcripts_mozilla_cleaned
Original transcripts remain unchanged.


## WER evaluations

In [9]:
# Load WER metric
wer_metric = load("wer")

# First the hardest gold/reference transcripts, 'orthographic'
ref_orth = read_reference_transcripts(os.path.join('data', 'reference_transcripts_mozilla', 'orthographic'))

# Read the raw ASR output once; it does not change between models, so there is no
# need to re-read the directory on every loop iteration.
asr_mozilla_raw = read_asr_transcripts(os.path.join("output", "transcripts_mozilla"))

for model_id in MODELS:
    # Predictions are looked up by the model name without its organisation prefix
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(references=ref_orth, predictions=asr_mozilla_raw[model_name])
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.29
WER for whisper-9-dutch: 0.31
WER for wav2vec2-xls-r-1b-dutch-3: 0.29
WER for wav2vec2-xls-r-1b-dutch: 0.32
WER for whisper-large-v3: 0.07
WER for whisper-large-v2: 0.08
WER for whisper-small: 0.14
WER for whisper-medium: 0.09
WER for Voxtral-Mini-3B-2507: 0.08


In [10]:
# Load reference transcripts with punctuation and capitals removed.
# This seems most fitting for the Dutch Whisper model fine-tuned for older individuals, as it
# does not predict punctuation and capitals, but does predict fillers.
ref_norm = read_reference_transcripts(os.path.join('data', 'reference_transcripts_mozilla', 'normalized'))

# Read the raw ASR output once; it does not change between models, so there is no
# need to re-read the directory on every loop iteration.
asr_mozilla_raw = read_asr_transcripts(os.path.join("output", "transcripts_mozilla"))

for model_id in MODELS:
    # Predictions are looked up by the model name without its organisation prefix
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(references=ref_norm, predictions=asr_mozilla_raw[model_name])
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.09
WER for whisper-9-dutch: 0.11
WER for wav2vec2-xls-r-1b-dutch-3: 0.29
WER for wav2vec2-xls-r-1b-dutch: 0.32
WER for whisper-large-v3: 0.26
WER for whisper-large-v2: 0.27
WER for whisper-small: 0.32
WER for whisper-medium: 0.28
WER for Voxtral-Mini-3B-2507: 0.27


In [12]:
# Load the 'normalized_clean' reference transcripts and score them against the CLEANED
# ASR output (transcripts_mozilla_cleaned: fillers, punctuation, capitals and
# leading/trailing whitespace removed by TranscriptCleaner).
# NOTE(review): presumably the normalized_clean references also have fillers removed —
# confirm in process_gold_transcripts.
ref_norm_clean = read_reference_transcripts(os.path.join('data', 'reference_transcripts_mozilla', 'normalized_clean'))

# Read the cleaned ASR output once; it does not change between models, so there is no
# need to re-read the directory on every loop iteration.
asr_mozilla_cleaned = read_asr_transcripts(os.path.join("output", "transcripts_mozilla_cleaned"))

for model_id in MODELS:
    # Predictions are looked up by the model name without its organisation prefix
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(references=ref_norm_clean, predictions=asr_mozilla_cleaned[model_name])
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.09
WER for whisper-9-dutch: 0.11
WER for wav2vec2-xls-r-1b-dutch-3: 0.19
WER for wav2vec2-xls-r-1b-dutch: 0.21
WER for whisper-large-v3: 0.05
WER for whisper-large-v2: 0.05
WER for whisper-small: 0.11
WER for whisper-medium: 0.07
WER for Voxtral-Mini-3B-2507: 0.06
WER for whisper-large-v3-turbo: 0.05


In [None]:
# Dataset size: word counts of two reference variants and total segment audio length
mozilla_ref_root = os.path.join('data', 'reference_transcripts_mozilla')

words_orthographic = count_total_words(os.path.join(mozilla_ref_root, 'orthographic'))
print(f"Total words Orthographic: {words_orthographic}")

words_normalized_clean = count_total_words(os.path.join(mozilla_ref_root, 'normalized_clean'))
print(f"Total words Normalized_clean: {words_normalized_clean}")

# get_total_audio_duration's result is divided by 60 to report minutes
duration_minutes = get_total_audio_duration(os.path.join('output', 'segments_mozilla')) / 60
print(f"Total minutes duration segments: {duration_minutes:.2f}")


Total words Orthographic: 548
Total words Normalized_clean: 488
Total minutes duration segments: 4.44


# Tiberon data

In [14]:
import pandas as pd

# Load the results table from results_elderly_60+.csv and show its first five rows
results_csv = 'results_elderly_60+.csv'
df = pd.read_csv(results_csv)

df.head(n=5)

Unnamed: 0,filename,duration,transcription_time,RTF,WER,reference,hypothesis
0,common_voice_nl_17695133,6.6,3,0.52,14.29,men kan geen ijzer met handen breken,men kan geen ijzer met hallen breken
1,common_voice_nl_17695137,6.19,3,0.55,10.0,mijn trekrugzak woog achttien kilo toen we die...,mijn trekrugzak woog achttien kilo toen we die...
2,common_voice_nl_17695138,3.98,2,0.71,0.0,waar zijn de toiletten,waar zijn de toiletten
3,common_voice_nl_17695139,3.79,2,0.72,16.67,er komt pus uit de wonde,er komt pus uit de holde
4,common_voice_nl_17695140,5.38,2,0.54,0.0,ik neem altijd een drinkbus met water mee,ik neem altijd een drinkbus met water mee


In [15]:
import pandas as pd

# Load the results table from results_v2_60+.csv and show its first five rows.
# Note: this rebinds `df`, replacing the frame loaded from results_elderly_60+.csv.
results_csv = 'results_v2_60+.csv'
df = pd.read_csv(results_csv)

df.head(n=5)

Unnamed: 0,filename,duration,transcription_time,RTF,WER,reference,hypothesis
0,common_voice_nl_17695133,6.6,3,0.54,14.29,men kan geen ijzer met handen breken,men kan geen ijzer met hallen breken
1,common_voice_nl_17695137,6.19,3,0.53,10.0,mijn trekrugzak woog achttien kilo toen we die...,mijn trekrugzak woog 18 kilo toen we die trek...
2,common_voice_nl_17695138,3.98,2,0.69,0.0,waar zijn de toiletten,waar zijn de toiletten
3,common_voice_nl_17695139,3.79,2,0.75,16.67,er komt pus uit de wonde,er komt pus uit de wollen
4,common_voice_nl_17695140,5.38,2,0.54,0.0,ik neem altijd een drinkbus met water mee,ik neem altijd een drinkbus met water mee
