# ASR Commands Model Evaluation

This notebook loads the archived ASR commands model and evaluates it on the test split. The computed metrics (`metrics.json`) and the confusion-matrix plot (`confusion_matrix.png`) are saved to the evaluation directory.

In [None]:
# Setup: imports and paths
import sys
from pathlib import Path
import json
import numpy as np
import torch
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
from datasets import Dataset, Audio
import matplotlib.pyplot as plt

# All three paths below are relative, so resolve() anchors them to the current
# working directory at the time this cell runs.

# Evaluation directory: metrics.json and confusion_matrix.png are written here.
# NOTE(review): this goes up one directory level while MODEL_DIR and SPLITS_PATH
# go up two — confirm against the repository layout.
EVAL_DIR = Path('../evaluation/asr_commands').resolve()
# Directory handed to from_pretrained() for both the model and its feature extractor.
MODEL_DIR = Path('../../models/asr_commands_best').resolve()  # Adjust if needed
# JSON file that provides the 'labels' list and the 'splits' -> 'test' records.
SPLITS_PATH = Path('../../outputs/asr_commands/preprocessing/splits.json').resolve()

# EVAL_DIR must be on sys.path before the import below; eval_utils presumably
# lives in that directory (verify). Inserting at index 0 gives it priority over
# any other module named eval_utils.
sys.path.insert(0, str(EVAL_DIR))
from eval_utils import compute_metrics

In [None]:
# Load the test split and the ordered label vocabulary
with open(SPLITS_PATH, 'r', encoding='utf-8') as f:
    splits = json.load(f)
labels = splits['labels']
test_records = splits['splits']['test']

# Build the name -> class-index table once so every record is a constant-time
# lookup instead of a linear labels.index() scan. Walking the list in reverse
# keeps the FIRST position of a repeated name, which is what labels.index()
# would return.
label_to_id = {name: idx for idx, name in reversed(list(enumerate(labels)))}


def _label_id(name):
    """Return the class index of *name*.

    Raises:
        ValueError: if *name* is not present in ``labels`` (same exception type
            that ``labels.index`` raises, but naming the offending label).
    """
    try:
        return label_to_id[name]
    except KeyError:
        raise ValueError(
            f"label {name!r} from the test split is not listed in splits['labels']"
        ) from None


# One row per test record: the audio file path plus the integer class index.
# Note that test_df is a plain list of dicts, not a DataFrame.
test_df = [
    {'audio': r['path'], 'label': _label_id(r['label'])} for r in test_records
]
# Casting the path column to Audio makes the dataset decode each clip on access,
# at the 16 kHz sampling rate requested here.
test_ds = Dataset.from_list(test_df).cast_column('audio', Audio(sampling_rate=16000))

In [None]:
# Load the feature extractor and the classifier from the same archive directory
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_DIR)
# eval() returns the module itself, so it can be chained onto the load call;
# it switches layers such as dropout to inference behaviour.
model = AutoModelForAudioClassification.from_pretrained(MODEL_DIR).eval()
# Kept as the final expression so the notebook still displays the model summary.
model.to(device)

In [None]:
# Turn each decoded clip into the model's input features
def preprocess(batch):
    """Attach the extracted ``input_values`` to one dataset row.

    Despite the parameter name, ``map`` is called below without ``batched=True``,
    so ``batch`` is expected to be a single example (assumption based on the
    ``datasets`` default — confirm if the call changes).

    Args:
        batch: row whose ``'audio'`` entry exposes ``'array'`` and
            ``'sampling_rate'``.

    Returns:
        The same row with an added ``'input_values'`` entry.
    """
    clip = batch['audio']
    features = feature_extractor(
        clip['array'],
        sampling_rate=clip['sampling_rate'],
    )
    # Only one clip was passed in, so keep the first entry of the output.
    batch['input_values'] = features['input_values'][0]
    return batch


test_ds_proc = test_ds.map(preprocess)

In [None]:
# Predict a class index for every preprocessed test example
def _predict_class(example):
    """Return the argmax class index the model assigns to one example."""
    # unsqueeze(0) adds the leading batch dimension for a single example.
    features = torch.tensor(example['input_values']).unsqueeze(0).to(device)
    scores = model(features).logits.cpu().numpy()[0]
    return np.argmax(scores)


# Gradients are not needed for evaluation, so run the whole pass under no_grad.
with torch.no_grad():
    _pairs = [(_predict_class(example), example['label']) for example in test_ds_proc]

all_preds = np.array([predicted for predicted, _ in _pairs])
all_labels = np.array([expected for _, expected in _pairs])

In [None]:
# Score the predictions and persist the result next to the other evaluation outputs
metrics = compute_metrics(all_labels, all_preds, labels)

metrics_path = EVAL_DIR / 'metrics.json'
with metrics_path.open('w', encoding='utf-8') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

# Bare expression so the notebook renders the metrics as the cell output.
metrics

In [None]:
# Plot the confusion matrix (rows: true class, columns: predicted class) and save it
cm = np.array(metrics['confusion_matrix'])

fig, ax = plt.subplots(figsize=(8, 8))
im = ax.imshow(cm, cmap='Blues')

# One tick per class on both axes, labelled with the class names.
tick_positions = np.arange(len(labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_yticklabels(labels)

ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')

fig.colorbar(im, ax=ax)
fig.tight_layout()
fig.savefig(EVAL_DIR / 'confusion_matrix.png')
plt.show()

## Results
- Metrics are saved to `metrics.json` in the evaluation directory.
- Confusion matrix is saved as `confusion_matrix.png`.
- See the classification report in the metrics output above.