# Setup

In [None]:
import os
import re

# Helper functions
from utils.segment_audio import segment_audio_file
from utils.convert_m4a_to_wav import convert_m4a_to_wav
from utils.transcribe import transcribe_segments
from utils.check_mono import check_and_convert_to_mono
from utils.process_gold_transcripts import process_gold_transcripts
from utils.wer_evaluator import *
from utils.counters import *

# Load WER metric
from evaluate import load

# Constants: data locations and segmentation settings used by the cells below
RAW_DATA_DIR = os.path.join('data', 'raw')
CONVERTED_DATA_DIR = os.path.join('data', 'converted')
REFERENCE_TRANSCRIPTS_DIR = os.path.join('data', 'reference_transcripts')
OUTPUT_DIR = 'output'
MAX_SEGMENT_LENGTH = 30 * 1000  # 30 seconds in milliseconds

# Read API key from .secrets/hf_api_key.txt.
# The encoding is spelled out so the read does not depend on the platform's
# default locale; surrounding whitespace (e.g. a trailing newline) is stripped.
with open(os.path.join('.secrets', 'hf_api_key.txt'), 'r', encoding='utf-8') as file:
    AUTH_TOKEN = file.read().strip()

# List of ASR models to use: Whisper variants plus one wav2vec2 model.
# The WER cells look up transcripts by the part of each id after the last '/'.
MODELS = [
    "golesheed/whisper-native-elderly-9-dutch",
    "golesheed/wav2vec2-xls-r-1b-dutch",
    "openai/whisper-large-v3",
    "openai/whisper-large-v2",
    "openai/whisper-small",
    "openai/whisper-medium",
    #"ibm-granite/granite-speech-3.3-2b"
]

  from .autonotebook import tqdm as notebook_tqdm


# Reformat fragments

In [2]:
# Correct filenames and formats:
# rename every raw recording to interaction_R<number><extension>, where <number>
# is the first run of digits in the original name, then hand it to the converter.
for filename in os.listdir(RAW_DATA_DIR):
    if filename == '.DS_Store':
        continue  # macOS Finder metadata, not a recording
    match = re.search(r'(\d+)', filename)
    if not match:
        continue  # no number in the name: leave the file untouched
    number = match.group(1)
    extension = os.path.splitext(filename)[1]
    new_filename = f"interaction_R{number}{extension}"
    old_path = os.path.join(RAW_DATA_DIR, filename)
    new_path = os.path.join(RAW_DATA_DIR, new_filename)
    if new_path != old_path:
        # os.rename silently replaces an existing target on POSIX, which would
        # destroy a raw recording when two names share the same first number.
        # samefile() keeps case-only renames working on case-insensitive filesystems.
        if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
            raise FileExistsError(
                f"Cannot rename {old_path!r}: {new_path!r} already exists"
            )
        os.rename(old_path, new_path)
    # The converter is given the bare file name, not a path
    # (presumably it resolves it against the raw data directory - TODO confirm).
    convert_m4a_to_wav(new_filename)

# Segment fragments

In [3]:
# Obtain short segments of audio for each speaker from the converted recordings in data/converted.
# segment_audio_file is given OUTPUT_DIR as its destination (the mono-conversion cell
# reads the segments back from OUTPUT_DIR/segments).
# Do some manual cleanup afterwards and remove segments of the Welzijn.AI bot or the experimenter.
for filename in os.listdir(CONVERTED_DATA_DIR):
    if filename == '.DS_Store':
        continue  # macOS Finder metadata, not audio (same skip as the rename cell)
    file_path = os.path.join(CONVERTED_DATA_DIR, filename)
    segment_audio_file(file_path, OUTPUT_DIR)


# Convert fragments to mono

In [None]:
# Run the mono check/conversion helper on the segments folder inside OUTPUT_DIR
segments_dir = os.path.join(OUTPUT_DIR, "segments")
check_and_convert_to_mono(segments_dir)

# Transcribe fragments

In [4]:
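# Transcription is currently disabled; uncomment the call below to run it.
# NOTE(review): as written it would only transcribe with the first entry of MODELS,
# while the WER cells below expect transcripts for every model in MODELS.
#transcribe_segments(MODELS[:1])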

# Pre- and postprocess transcription formats

In [2]:
# Create different sets of gold transcripts for different types of WER evaluation.
# Per the output of the last run, the helper reads data/reference_transcripts/orthographic
# and produces the orthographic_clean and normalized sets that the WER cells load.
process_gold_transcripts()

Processed files from: data/reference_transcripts/orthographic
  Orthographic_clean: data/reference_transcripts/orthographic_clean
  Normalized: data/reference_transcripts/normalized


# Compute WER

## Orthographic clean

In [None]:
# Load WER metric
wer_metric = load("wer")

# Load reference transcripts with fillers removed, but where no further normalisation is done,
# i.e. capitals and punctuation are preserved. This seems most fitting for whisper-large v3 and v2,
# and small and medium, which predict punctuation and capitals. It is less fitting for the Dutch
# Whisper model for older individuals, as it does not predict punctuation and capitals,
# but does predict fillers.
reference_orth_clean = read_reference_transcripts('data/reference_transcripts/orthographic_clean')

# Fetch the ASR transcripts once instead of once per model; the result is
# indexed by the model id without its organisation prefix.
asr_transcripts = read_asr_transcripts()

for model_id in MODELS:
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(
        references=reference_orth_clean,
        predictions=asr_transcripts[model_name],
    )
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.42
WER for wav2vec2-xls-r-1b-dutch: 0.58
WER for whisper-large-v3: 0.08
WER for whisper-large-v2: 0.15
WER for whisper-small: 0.23
WER for whisper-medium: 0.16


In [5]:
# Load reference transcripts with punctuation and capitals removed.
# This seems most fitting for the Dutch Whisper model fine-tuned for older individuals,
# as it does not predict punctuation and capitals, but does predict fillers.
# Uses wer_metric from the orthographic-clean cell.
reference_norm = read_reference_transcripts('data/reference_transcripts/normalized')

# Fetch the ASR transcripts once instead of once per model; the result is
# indexed by the model id without its organisation prefix.
asr_transcripts = read_asr_transcripts()

for model_id in MODELS:
    model_name = model_id.split('/')[-1]
    wer = wer_metric.compute(
        references=reference_norm,
        predictions=asr_transcripts[model_name],
    )
    print(f"WER for {model_name}: {wer:.2f}")

WER for whisper-native-elderly-9-dutch: 0.16
WER for wav2vec2-xls-r-1b-dutch: 0.54
WER for whisper-large-v3: 0.34
WER for whisper-large-v2: 0.37
WER for whisper-small: 0.41
WER for whisper-medium: 0.37


# Simple stats

In [5]:
# Dataset size: word totals for both reference sets, then total segment audio length.
orth_clean_word_total = count_total_words('data/reference_transcripts/orthographic_clean')
print(f"Total words orthographic_clean: {orth_clean_word_total}")

normalized_word_total = count_total_words('data/reference_transcripts/normalized')
print(f"Total words normalized: {normalized_word_total}")

# The helper's result is divided by 60 to report minutes (so presumably it returns seconds).
segments_duration_minutes = get_total_audio_duration('output/segments') / 60
print(f"Total minutes duration segments: {segments_duration_minutes:.2f}")

Total words orthographic_clean: 1960
Total words normalized: 2033
Total minutes duration segments: 11.15
