# Whisper Model Evaluation & Testing Notebook

This Colab notebook allows you to load your fine-tuned Whisper model and evaluate or test it in multiple modes:
- Evaluate on test set samples
- Evaluate with user-provided audio/text files (Gradio)
- Test with audio file (Gradio)
- Test with live microphone audio

It also supports language selection (auto/manual) and displays reference vs hypothesis with WER calculation.

In [1]:
# Import Required Libraries
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import gradio as gr
import jiwer
import librosa
import numpy as np
import os
from typing import List
import IPython.display as ipd

# For microphone input
try:
    import sounddevice as sd
except ImportError:
    sd = None

# For widgets
try:
    import ipywidgets as widgets
    from IPython.display import display
except ImportError:
    widgets = None
    display = None


## Load Trained Whisper Model

Load your fine-tuned Whisper model from a local directory or Hugging Face Hub. Specify the model path or repo ID below.

In [2]:
from huggingface_hub import login

# Log in to the Hugging Face Hub with your token.
try:
    login(token="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")
except Exception as e:
    print(f"Error during Hugging Face login: {e}")
    print("Please double-check your hardcoded token.")

In [3]:
from huggingface_hub import snapshot_download
import os
import shutil

MODEL_PATH = "./checkpoints/whisper-medium-assamese/checkpoint-4000/"  # Your trained model dir
BASE_MODEL_REPO = "openai/whisper-medium"  # Change if you used a different base model

# List of files typically needed for Whisper processor/tokenizer
required_files = [
    "preprocessor_config.json",
    "tokenizer.json",
    "special_tokens_map.json",
    "tokenizer_config.json",
    "vocab.json",
    "merges.txt"
]

missing = []
for fname in required_files:
    if not os.path.exists(os.path.join(MODEL_PATH, fname)):
        missing.append(fname)

if missing:
    print(f"Missing files in {MODEL_PATH}: {missing}")
    print(f"Downloading from Hugging Face Hub: {BASE_MODEL_REPO} ...")
    # Download snapshot to a temporary directory
    base_model_dir = snapshot_download(BASE_MODEL_REPO, allow_patterns=required_files)
    for fname in missing:
        src = os.path.join(base_model_dir, fname)
        dst = os.path.join(MODEL_PATH, fname)
        if os.path.exists(src):
            shutil.copy(src, dst)
            print(f"Copied {fname}")
        else:
            print(f"WARNING: {fname} not found in {BASE_MODEL_REPO}, skipping.")
else:
    print("All processor/tokenizer files present in model directory.")
    

Missing files in ./checkpoints/whisper-medium-assamese/checkpoint-4000/: ['tokenizer.json', 'special_tokens_map.json', 'tokenizer_config.json', 'vocab.json', 'merges.txt']
Downloading from Hugging Face Hub: openai/whisper-medium ...


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Copied tokenizer.json
Copied special_tokens_map.json
Copied tokenizer_config.json
Copied vocab.json
Copied merges.txt


In [4]:
# Specify the path to your trained model (local directory or Hugging Face repo ID)
MODEL_PATH = "./checkpoints/whisper-medium-assamese/checkpoint-4000/"  # Change as needed

# Load processor and model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
processor = WhisperProcessor.from_pretrained(MODEL_PATH)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
model.eval()
print(f"Model loaded from {MODEL_PATH} on {device}")

Model loaded from ./checkpoints/whisper-medium-assamese/checkpoint-4000/ on cpu


## Language Selection (Auto/Manual)

Choose whether to use automatic language detection or manually specify the language for transcription.

In [5]:
# Language selection widget (auto/manual)
language_mode = "auto"  # or "manual"
manual_language = "as"   # default ISO code for Assamese

if widgets:
    language_mode_widget = widgets.ToggleButtons(
        options=[('Auto', 'auto'), ('Manual', 'manual')],
        value='auto',
        description='Language Mode:',
        disabled=False,
        button_style=''
    )
    manual_language_widget = widgets.Text(
        value='as',
        description='Language:',
        disabled=False
    )
    def on_language_mode_change(change):
        global language_mode
        language_mode = change['new']
        if language_mode == 'manual':
            display(manual_language_widget)
    language_mode_widget.observe(on_language_mode_change, names='value')
    display(language_mode_widget)
else:
    print("ipywidgets not available. Set language_mode and manual_language variables manually.")

ToggleButtons(description='Language Mode:', options=(('Auto', 'auto'), ('Manual', 'manual')), value='auto')

Text(value='as', description='Language:')

## Mode Selection

Select the mode for evaluation or testing: 
- Eval on test set
- Eval with user files (Gradio)
- Test with audio file (Gradio)
- Test with live mic audio

In [6]:
# Mode selection widget
eval_modes = [
    ("Eval: Test Set Samples", "eval_testset"),
    ("Eval: User Audio/Text Files (Gradio)", "eval_user_files"),
    ("Test: Transcribe Audio File (Gradio)", "test_audio_file"),
    ("Test: Live Mic Transcription", "test_live_mic")
]
selected_mode = "eval_testset"

if widgets:
    mode_widget = widgets.ToggleButtons(
        options=eval_modes,
        value='eval_testset',
        description='Mode:',
        disabled=False,
        button_style=''
    )
    display(mode_widget)
else:
    print("ipywidgets not available. Set selected_mode variable manually.")

ToggleButtons(description='Mode:', options=(('Eval: Test Set Samples', 'eval_testset'), ('Eval: User Audio/Tex…

## Display Reference vs Hypothesis and Calculate WER

For all evaluation modes, display the reference and hypothesis transcriptions side by side and calculate the Word Error Rate (WER).

In [7]:
# Utility function to display reference vs hypothesis and calculate WER
def display_ref_vs_hyp(refs: List[str], hyps: List[str]):
    for i, (ref, hyp) in enumerate(zip(refs, hyps)):
        print(f"Sample {i+1}:")
        print(f"Reference:   {ref}")
        print(f"Hypothesis:  {hyp}")
        print("-")
    wer = jiwer.wer(refs, hyps)
    print(f"WER: {wer:.3f}")

# Example usage (after running an eval mode):
# display_ref_vs_hyp(references, hypotheses)

## Eval Mode: Evaluate on Test Set Samples

Select a number of samples from the test set, run inference, and calculate WER.

In [8]:
# Example: Load test set and evaluate a few samples
import random
from datasets import load_dataset

def transcribe(audio, language=None):
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    if language_mode == 'manual' and manual_language:
        forced_lang = manual_language
    else:
        forced_lang = None
    input_features = inputs.input_features.to(device)
    with torch.no_grad():
        predicted_ids = model.generate(input_features, language=forced_lang)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

# Load test set (update as needed)
testset = load_dataset("mozilla-foundation/common_voice_11_0", lang, split="test", cache_dir="./datasets")
num_samples = 5  # Change as needed
samples = random.sample(list(testset), num_samples)

references = []
hypotheses = []
for sample in samples:
    audio = sample['audio']['array']
    ref = sample['sentence']
    hyp = transcribe(audio)
    references.append(ref)
    hypotheses.append(hyp)
    print(f"Reference: {ref}\nHypothesis: {hyp}\n---")

# Calculate WER and display results
wer = jiwer.wer(references, hypotheses)
print(f"WER on {num_samples} test samples: {wer:.3f}")
display_ref_vs_hyp(references, hypotheses)

NameError: name 'lang' is not defined

## Eval Mode: User Provided Audio and Text Files (Gradio)

Upload audio and reference text files for evaluation. The notebook will transcribe the audio, display reference vs hypothesis, and calculate WER.

In [None]:
# Gradio interface for user-provided audio and text files
def eval_user_files(audio_file, text_file):
    audio, sr = librosa.load(audio_file, sr=16000)
    with open(text_file, 'r', encoding='utf-8') as f:
        reference = f.read().strip()
    hypothesis = transcribe(audio)
    # Use display_ref_vs_hyp for consistent output
    import io
    import sys
    buf = io.StringIO()
    sys.stdout = buf
    display_ref_vs_hyp([reference], [hypothesis])
    sys.stdout = sys.__stdout__
    return buf.getvalue()

gr.Interface(
    fn=eval_user_files,
    inputs=[gr.Audio(source="upload", type="filepath", label="Audio File"), gr.File(label="Reference Text File")],
    outputs="text",
    title="Eval: User Provided Audio and Text Files",
    description="Upload an audio file and a reference text file to evaluate WER."
).launch(share=False)

## Test Mode: Transcribe from Audio File (Gradio)

Upload an audio file to transcribe using the trained Whisper model. The transcription will be displayed below.

In [None]:
# Gradio interface for audio file transcription
def transcribe_audio_file(audio_file):
    audio, sr = librosa.load(audio_file, sr=16000)
    transcription = transcribe(audio)
    return transcription

gr.Interface(
    fn=transcribe_audio_file,
    inputs=gr.Audio(source="upload", type="filepath", label="Audio File"),
    outputs="text",
    title="Test: Transcribe Audio File",
    description="Upload an audio file to transcribe using the trained Whisper model."
).launch(share=False)

## Test Mode: Live Transcription from Microphone

Capture audio from your microphone and transcribe it in real time using the trained Whisper model.

In [None]:
# Live microphone transcription (requires sounddevice)
def record_and_transcribe(duration=5, fs=16000):
    if sd is None:
        return "sounddevice not installed."
    print(f"Recording {duration} seconds of audio...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()
    audio = audio.flatten()
    transcription = transcribe(audio)
    print(f"Transcription: {transcription}")
    return transcription

if sd is not None:
    duration = 5  # seconds
    print("Click in the cell and run to record from your mic.")
    record_and_transcribe(duration=duration)
else:
    print("sounddevice not available. Install it to use live mic transcription.")