# 04 - Results Visualization
Analyze trained model performance: confusion matrix, per-class metrics, error analysis.

**Runtime:** CPU is sufficient for this notebook.

In [None]:
# Colab Setup
import os
IN_COLAB = 'COLAB_GPU' in os.environ or os.path.exists('/content')

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

    REPO_DIR = '/content/Driver-Activity-Recognition'
    if not os.path.exists(REPO_DIR):
        !git clone https://github.com/batuhne/Driver-Activity-Recognition.git {REPO_DIR}

    os.chdir(REPO_DIR)
    !pip install -q -r requirements.txt
    DATA_ROOT = '/content/drive/MyDrive/DriveAndAct'
else:
    DATA_ROOT = './data'

print(f'Working directory: {os.getcwd()}')
print(f'Data root: {DATA_ROOT}')

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.utils import load_config
from src.dataset import parse_annotations, get_dataloaders
from src.models import ActivityLSTM
from src.evaluate import evaluate_model, plot_confusion_matrix, plot_per_class_metrics

config = load_config()
if IN_COLAB:
    config['data']['root'] = DATA_ROOT
    # Load outputs from Drive
    drive_output = os.path.join(DATA_ROOT, 'results')
    config['output']['checkpoint_dir'] = os.path.join(drive_output, 'checkpoints')
    config['output']['log_dir'] = os.path.join(drive_output, 'logs')
    config['output']['figure_dir'] = os.path.join(drive_output, 'figures')

# Use fine-tuned features (must match what the model was trained on)
config['features']['save_dir'] = 'features_finetuned'

plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

## Load Best Model

In [None]:
# Restore the best checkpoint and rebuild the ActivityLSTM it was trained with.
# The checkpoint is read for: idx_to_label, label_to_idx, model_state_dict,
# epoch, val_loss, val_acc and (optionally) the training-time config.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

checkpoint_dir = config['output']['checkpoint_dir']
checkpoint_path = os.path.join(checkpoint_dir, 'best_model.pth')
# NOTE: weights_only=False performs a full unpickle of the file, which can
# execute arbitrary code -- only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

# Label mappings saved alongside the weights define the class count.
label_to_idx = checkpoint['label_to_idx']
idx_to_label = checkpoint['idx_to_label']
num_classes = len(idx_to_label)

# Prefer the config stored in the checkpoint; fall back to the current config
# when the checkpoint has no 'config' entry or that entry has no 'model' section.
saved_cfg = checkpoint.get('config', config)
model_cfg = saved_cfg.get('model', config['model'])
training_cfg = saved_cfg.get('training', {})

# Architecture hyper-parameters, each with the default used when the key is absent.
model_kwargs = {
    'input_dim': model_cfg.get('feature_dim', 512),
    'hidden_dim': model_cfg.get('lstm_hidden', 256),
    'num_layers': model_cfg.get('lstm_layers', 2),
    'num_classes': num_classes,
    'lstm_dropout': model_cfg.get('lstm_dropout', 0.3),
    'fc_dropout': model_cfg.get('fc_dropout', 0.5),
    'use_layernorm': model_cfg.get('use_layernorm', False),
    'bidirectional': model_cfg.get('bidirectional', False),
    'pooling': model_cfg.get('pooling', 'last'),
    'noise_std': training_cfg.get('noise_std', 0.0),
}
model = ActivityLSTM(**model_kwargs).to(device)

model.load_state_dict(checkpoint['model_state_dict'])

# Summary of what was loaded.
print(f'Loaded model from epoch {checkpoint["epoch"]}')
print(f'Val Loss: {checkpoint["val_loss"]:.4f}, Val Acc: {checkpoint["val_acc"]:.4f}')
print(f'BiLSTM: {model_cfg.get("bidirectional", False)}, Pooling: {model_cfg.get("pooling", "last")}')

## Test Set Evaluation

In [None]:
# Build the dataloaders from the config (feature_based=True) and evaluate the
# loaded model on the 'test' loader. `metrics` is indexed later in this
# notebook by 'mean_per_class_acc' and 'overall_acc'.
loaders = get_dataloaders(config, feature_based=True)
metrics = evaluate_model(model, loaders['test'], device, idx_to_label, config)

## Comparison with Literature

In [None]:
# Side-by-side table of this model's test metrics and a published baseline.
import pandas as pd

# Reference results from Drive&Act paper (Martin et al., 2019)
# IR-only, mid-level activities
method_names = ['I3D (IR)', 'ResNet-18 + LSTM (Ours)']
mean_per_class_col = [0.65, metrics['mean_per_class_acc']]
# No overall accuracy is listed for the baseline, hence None in its row.
overall_col = [None, metrics['overall_acc']]

comparison = {
    'Method': method_names,
    'Mean Per-Class Acc': mean_per_class_col,
    'Overall Acc': overall_col,
}
df_comp = pd.DataFrame(comparison)

# Header line followed by the table rendered without the index column.
print('\n=== Comparison with Literature ===', df_comp.to_string(index=False), sep='\n')

## Error Analysis

In [None]:
# Identify most confused class pairs
# Runs the model over the test loader, builds a confusion matrix, removes the
# diagonal and lists the off-diagonal cells with the highest counts.
from sklearn.metrics import confusion_matrix

# Collect predicted and true class indices for every test batch.
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for features, labels in loaders['test']:
        features = features.to(device)
        logits = model(features)
        batch_preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(batch_preds)
        all_labels.extend(labels.numpy())

cm = confusion_matrix(all_labels, all_preds, labels=range(num_classes))
np.fill_diagonal(cm, 0)  # Zero out correct predictions

# Every remaining non-zero cell is a (true, predicted) confusion; argwhere
# yields them in row-major order, matching a nested row/column scan.
confused_pairs = [
    (idx_to_label[int(true_idx)], idx_to_label[int(pred_idx)], cm[true_idx, pred_idx])
    for true_idx, pred_idx in np.argwhere(cm > 0)
]

# Most frequent confusions first (stable sort keeps scan order among ties).
confused_pairs.sort(key=lambda pair: pair[2], reverse=True)

# Top 10 most confused pairs
print('Top 10 Most Confused Pairs (True -> Predicted, Count):')
for true_label, pred_label, count in confused_pairs[:10]:
    print(f'  {true_label} -> {pred_label}: {count}')

## Attention Weights Visualization
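Visualize how the model attends to different timesteps when making predictions. This section only applies to models that use `pooling="attention"`; for any other pooling mode the cell prints a notice and skips the plot.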

In [None]:
from src.evaluate import plot_attention_weights

# Plot attention weights for test samples, but only when the loaded model
# was configured with attention pooling; otherwise report the current mode.
pooling_mode = model_cfg.get('pooling', 'last')

if pooling_mode != 'attention':
    print('Attention visualization requires pooling="attention". Current:', pooling_mode)
else:
    # The figure is written into the configured figure directory.
    attention_png = os.path.join(config['output']['figure_dir'], 'attention_weights.png')
    fig = plot_attention_weights(
        model, loaders['test'], device, idx_to_label,
        save_path=attention_png,
        num_samples=8,
    )
    # Confirm only when the helper returned a truthy value
    # (presumably the figure object -- TODO confirm against src.evaluate).
    if fig:
        print('Attention weights saved.')