# VLM Evaluation Framework - Quick Start

This notebook demonstrates how to use the VLM evaluation framework with a simple CNN encoder and a dummy dataset.

**No external data or pretrained weights required!**

## 1. Setup and Imports

In [None]:
import sys
from pathlib import Path

# Add project to path.
# The project root is taken to be the parent of the current working directory,
# so this only resolves correctly when the kernel is started from a folder that
# sits directly under the project root — NOTE(review): confirm launch directory.
project_root = Path.cwd().parent
# Inserted at position 0 and before the vlm_eval imports below, so the project
# root is already on sys.path when those imports run.
sys.path.insert(0, str(project_root))

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

# Import framework components
# NOTE(review): SimpleCNNEncoder, LinearProbeHead and DummyDataset are not
# referenced by name in the cells visible here; presumably importing them is
# what registers them with the registries — confirm before removing.
from vlm_eval import EncoderRegistry, HeadRegistry, DatasetRegistry
from vlm_eval.encoders import SimpleCNNEncoder
from vlm_eval.heads import LinearProbeHead
from vlm_eval.datasets import DummyDataset

print("✓ Imports successful!")

## 2. Create Encoder

The framework uses a registry system for easy model creation.

In [None]:
# Show which encoder names the registry currently exposes
available_encoders = EncoderRegistry.list_available()
print("Available encoders:", available_encoders)

# Build the demo encoder by its registry name, without pretrained weights
encoder = EncoderRegistry.get(
    "simple_cnn",
    variant="base",
    pretrained=False,
)

# Report the encoder's class and the attributes it exposes, one per line
encoder_name = encoder.__class__.__name__
print(f"\nEncoder: {encoder_name}")
print(f"Output channels: {encoder.output_channels}")
print(f"Patch size: {encoder.patch_size}")
encoder_param_count = encoder.get_num_parameters()
print(f"Parameters: {encoder_param_count:,}")

## 3. Create Segmentation Head

The head takes encoder features and produces segmentation predictions.

In [None]:
# Show which head names the registry currently exposes
available_heads = HeadRegistry.list_available()
print("Available heads:", available_heads)

# Build the linear-probe head around `encoder`: 21 output classes, and the
# encoder is explicitly left unfrozen
head_kwargs = dict(encoder=encoder, num_classes=21, freeze_encoder=False)
head = HeadRegistry.get("linear_probe", **head_kwargs)

# Report the head's class followed by its three parameter counts
head_name = head.__class__.__name__
print(f"\nHead: {head_name}")
total_params = head.get_num_parameters()
print(f"Total parameters: {total_params:,}")
trainable_params = head.get_num_parameters(trainable_only=True)
print(f"Trainable parameters: {trainable_params:,}")
head_only_params = head.get_head_parameters()
print(f"Head-only parameters: {head_only_params:,}")

## 4. Create Dataset

For this demo, we use a dummy dataset that generates random images and masks.

In [None]:
# Show which dataset names the registry currently exposes
available_datasets = DatasetRegistry.list_available()
print("Available datasets:", available_datasets)

# Build the dummy dataset: 50 samples, image_size 224, 21 classes
dataset_kwargs = {"num_samples": 50, "image_size": 224, "num_classes": 21}
dataset = DatasetRegistry.get("dummy", **dataset_kwargs)

# Report the dataset's class, its length and its class metadata
dataset_name = dataset.__class__.__name__
print(f"\nDataset: {dataset_name}")
sample_count = len(dataset)
print(f"Number of samples: {sample_count}")
print(f"Number of classes: {dataset.num_classes}")
first_class_names = dataset.class_names[:5]  # Only the first 5 are shown
print(f"Class names: {first_class_names}...")

## 5. Visualize Sample Data

In [None]:
# Fetch the first sample and pull out the two tensors to plot
sample = dataset[0]
image, mask = sample["image"], sample["mask"]

# One row, two panels: image on the left, mask on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
image_ax, mask_ax = axes

# Left panel. permute(1, 2, 0) moves the leading axis last before plotting
# (assumes a channel-first image tensor — TODO confirm against the dataset).
image_for_plot = image.permute(1, 2, 0).numpy()
image_ax.imshow(image_for_plot)
image_ax.set_title(f"Image: {sample['filename']}")
image_ax.axis('off')

# Right panel: the mask rendered with the categorical 'tab20' colormap
mask_for_plot = mask.numpy()
mask_ax.imshow(mask_for_plot, cmap='tab20')
mask_ax.set_title("Segmentation Mask")
mask_ax.axis('off')

plt.tight_layout()
plt.show()

# Text summary of what was plotted
print(f"Image shape: {image.shape}")
print(f"Mask shape: {mask.shape}")
classes_in_mask = mask.unique().tolist()
print(f"Unique classes in mask: {classes_in_mask}")

## 6. Create DataLoader

In [None]:
# Single source of truth for the batch size, so the log line below cannot
# drift from the value actually passed to the DataLoader
BATCH_SIZE = 4

# Seed the shuffle so the sample order (and therefore the batch inspected in
# later cells) is reproducible across Restart & Run All
shuffle_generator = torch.Generator().manual_seed(0)

dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,  # Load in the main process; no worker subprocesses
    generator=shuffle_generator,
)

# Read the value back from the loader instead of repeating the literal
print(f"DataLoader created with batch_size={dataloader.batch_size}")
print(f"Number of batches: {len(dataloader)}")

## 7. Run Forward Pass

In [None]:
# Setup device: use CUDA when available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move models to device and switch them to eval mode.
# The encoder is called directly below, so it is moved and put in eval mode
# explicitly instead of relying on `head.to(device)` / `head.eval()` reaching
# it (that would only happen if the head registers the encoder as a submodule).
encoder = encoder.to(device)
encoder.eval()
head = head.to(device)
head.eval()

# Get one batch and place its tensors on the same device as the models
batch = next(iter(dataloader))
images = batch["image"].to(device)
masks = batch["mask"].to(device)

print("\nInput shapes:")
print(f"  Images: {images.shape}")
print(f"  Masks: {masks.shape}")

# Forward pass without gradient tracking: encoder -> head -> per-position
# argmax over dim 1 of the logits
with torch.no_grad():
    features = encoder(images)
    logits = head(features)
    predictions = logits.argmax(dim=1)

print("\nOutput shapes:")
print(f"  Features: {features.shape}")
print(f"  Logits: {logits.shape}")
print(f"  Predictions: {predictions.shape}")

## 8. Visualize Predictions

In [None]:
# Visualize first sample in batch
idx = 0

# Shared rendering options for both label maps.
# - vmin/vmax pin the colour scale to the full class range; without them
#   imshow autoscales each panel to its own min/max, so the same class id can
#   be drawn in different colours in the ground-truth and prediction panels.
# - interpolation='nearest' keeps class ids discrete instead of letting the
#   renderer blend neighbouring ids into intermediate values.
# Note: 'tab20' has 20 colours, so with more than 20 classes some ids share one.
label_style = dict(
    cmap='tab20',
    vmin=0,
    vmax=dataset.num_classes - 1,
    interpolation='nearest',
)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Input image: permute(1, 2, 0) moves the leading axis last for imshow
# (assumes channel-first image tensors — TODO confirm against the dataset)
axes[0].imshow(images[idx].cpu().permute(1, 2, 0).numpy())
axes[0].set_title("Input Image")
axes[0].axis('off')

# Ground truth mask
axes[1].imshow(masks[idx].cpu().numpy(), **label_style)
axes[1].set_title("Ground Truth Mask")
axes[1].axis('off')

# Prediction, drawn on the same colour scale as the ground truth
axes[2].imshow(predictions[idx].cpu().numpy(), **label_style)
axes[2].set_title("Prediction (Random - Untrained)")
axes[2].axis('off')

plt.tight_layout()
plt.show()

print("Note: Predictions are random since the model is untrained.")

## 9. Configuration-Based Usage

The framework supports YAML configuration files for easy experiment management.

In [None]:
import yaml
from vlm_eval.core.config import ExperimentConfig

# Load configuration
config_path = project_root / "configs" / "experiments" / "demo_simple_cnn.yaml"

# Read as UTF-8 explicitly; open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 everywhere
with open(config_path, encoding="utf-8") as f:
    config_dict = yaml.safe_load(f)

# safe_load returns None for an empty file and may return a list or scalar.
# Fail with a clear message here instead of an opaque error from the
# ** unpacking below.
if not isinstance(config_dict, dict):
    raise ValueError(
        f"Expected a YAML mapping at the top level of {config_path}, "
        f"got {type(config_dict).__name__}"
    )

# Create config object from the parsed mapping
config = ExperimentConfig(**config_dict)

print("Experiment configuration:")
print(f"  Name: {config.name}")
print(f"  Encoder: {config.encoder.name} ({config.encoder.variant})")
print(f"  Head: {config.head.name}")
print(f"  Dataset: {config.dataset.name}")
print(f"  Batch size: {config.inference.batch_size}")
print(f"  Device: {config.inference.device}")

# Create models from config: the encoder first, then the head built around it
encoder_from_config = EncoderRegistry.from_config(config.encoder.model_dump())
head_from_config = HeadRegistry.from_config(
    config.head.model_dump(),
    encoder=encoder_from_config
)

print("\n✓ Models created from configuration!")

## Summary

You've successfully:
1. ✅ Created an encoder using the registry system
2. ✅ Created a segmentation head
3. ✅ Loaded a dataset
4. ✅ Run a forward pass
5. ✅ Visualized predictions
6. ✅ Used configuration files

### Next Steps

1. **Implement real encoders**: Add RADIO, DINOv2, or CLIP encoders
2. **Add real datasets**: Implement Pascal VOC, ADE20K, or Cityscapes
3. **Training**: Add training loops and optimization
4. **Evaluation**: Implement mIoU and other metrics
5. **Experiments**: Run full evaluation pipelines

Check out the other notebooks for more examples!