# Evaluate Deformable DETR on CommonForms Test Split

This notebook evaluates your trained Deformable DETR model using COCO-style mAP metrics:
- mAP @ IoU 0.5:0.95 (primary metric)
- mAP @ IoU 0.5
- mAP @ IoU 0.75
- Per-class AP scores

**What you need:**
1. Your Hugging Face model ID (e.g., `your-username/deformable-detr-commonforms`)
2. Dataset name (e.g., `jbarrow/CommonForms`)
3. Test split name (e.g., `test` or `validation`)

## 1. Install Dependencies

In [None]:
!pip install -q transformers datasets pillow torch torchvision pycocotools accelerate huggingface_hub

## 2. Authentication (Required for Private Models)

In [None]:
# Authenticate with Hugging Face for private models
# Option 1: Using Colab Secrets (Recommended)
# - Add HF_TOKEN to Colab Secrets (key icon in left sidebar)
# Option 2: Direct login (uncomment the notebook_login line below)
#
# The token is looked up in Colab Secrets first and then in the HF_TOKEN
# environment variable, so the cell also works outside Colab. Each failure
# mode is reported separately instead of being hidden by a bare `except:`.

import os

HF_TOKEN = None
token_source = None

try:
    from google.colab import userdata

    HF_TOKEN = userdata.get('HF_TOKEN')
    token_source = "Colab Secrets"
except ImportError:
    # Not running on Colab; fall through to the environment variable.
    pass
except Exception as exc:
    # Secret missing, or notebook access to the secret not granted.
    print(f"⚠️  Could not read HF_TOKEN from Colab Secrets: {exc}")

if not HF_TOKEN:
    HF_TOKEN = os.environ.get('HF_TOKEN')
    token_source = "the HF_TOKEN environment variable" if HF_TOKEN else None

if HF_TOKEN:
    os.environ['HF_TOKEN'] = HF_TOKEN
    try:
        from huggingface_hub import login

        login(token=HF_TOKEN)
        print(f"✓ Successfully authenticated with Hugging Face using {token_source}")
    except Exception as exc:
        # A token was found but rejected (or the Hub is unreachable); say so
        # rather than claiming the token is missing.
        print(f"⚠️  Hugging Face login failed: {exc}")
else:
    print("⚠️  HF_TOKEN not found in Colab Secrets")
    print("For private models, either:")
    print("  1. Add HF_TOKEN to Colab Secrets, or")
    print("  2. Uncomment the line below to login manually")

    # Uncomment this line to login manually:
    # from huggingface_hub import notebook_login
    # notebook_login()

## 3. Configuration

In [None]:
# ========== CONFIGURATION ==========
# Edit the values below to point at your own model and data.

# Hugging Face Hub repository of the model to evaluate.
MODEL_ID = "your-username/deformable-detr-commonforms"

# Hub dataset and the split to score against
# (use "validation" when the dataset has no test split).
DATASET_NAME = "jbarrow/CommonForms"
TEST_SPLIT = "test"

# Cap on the number of evaluated samples: an int such as 100 for a quick
# test run, or None to evaluate the full split.
MAX_SAMPLES = None

# Images per batch during evaluation; lower this if GPU memory runs out.
BATCH_SIZE = 8

## 3. Load Model and Processor

In [None]:
# Load the detector and its image processor from the Hugging Face Hub.
#
# Later cells reference `torch`, `device`, `processor` and `model`, so they
# are all defined here. The dataset itself is loaded in the next section
# ("Load Dataset"), which is why this cell does not touch it.
import torch
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Run on the GPU when one is available; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Loading model {MODEL_ID}...")
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
model = AutoModelForObjectDetection.from_pretrained(MODEL_ID)
model.to(device)
model.eval()  # inference mode: disables dropout and similar training-only behaviour

print(f"✓ Model loaded on {device}")
print(f"Model labels: {model.config.id2label}")

## 4. Load Dataset

In [None]:
from datasets import get_dataset_split_names, load_dataset

print(f"Loading dataset {DATASET_NAME}, split: {TEST_SPLIT}...")

# Request only the evaluation split. If that fails (typically a wrong split
# name), list the valid split names from the Hub metadata before re-raising,
# which avoids loading every split just to print their names.
try:
    dataset = load_dataset(DATASET_NAME, split=TEST_SPLIT)
except Exception as exc:
    print(f"Error loading split '{TEST_SPLIT}': {exc}")
    try:
        print(f"Available splits: {get_dataset_split_names(DATASET_NAME)}")
    except Exception as names_exc:
        # Best effort only: the original error is what matters.
        print(f"(Could not list available splits: {names_exc})")
    raise

# Optionally truncate to the first MAX_SAMPLES rows for a quick run.
if MAX_SAMPLES is not None:
    dataset = dataset.select(range(min(MAX_SAMPLES, len(dataset))))
    print(f"Using first {len(dataset)} samples for evaluation")
else:
    print(f"Using full dataset: {len(dataset)} samples")

print(f"\nDataset features: {dataset.features}")

## 5. Helper Functions for Preprocessing

In [None]:
# Helpers that convert dataset rows and model outputs into the COCO structures
# consumed by pycocotools. Image ids follow the same rule everywhere in this
# notebook: the row's "id" field when present, otherwise its dataset position.


def _iter_objects(objects):
    """Yield one dict per annotated object.

    Accepts either a dict of parallel lists ({"bbox": [...], "category": [...]})
    or a list of per-object dicts; anything falsy yields nothing.
    """
    if not objects:
        return
    if isinstance(objects, dict):
        count = len(objects.get("bbox", []))
        for i in range(count):
            yield {
                key: values[i]
                for key, values in objects.items()
                if isinstance(values, (list, tuple)) and len(values) == count
            }
    else:
        yield from objects


def prepare_ground_truth(dataset):
    """Build COCO ``images`` and ``annotations`` lists from the dataset.

    Args:
        dataset: Iterable of rows with an "image" entry, an optional "id",
            and an "objects" entry holding "bbox" and "category" (optionally
            "area" and "iscrowd").

    Returns:
        Tuple ``(coco_images, coco_annotations)`` of lists of dicts.

    NOTE(review): assumes each "bbox" is COCO-style
    ``[x_min, y_min, width, height]`` in absolute pixels - TODO confirm
    against the dataset card.
    """
    coco_images = []
    coco_annotations = []
    # Start at 1: COCOeval's matching treats an annotation id of 0 as "no match".
    next_annotation_id = 1

    for idx, sample in enumerate(dataset):
        image_id = sample.get("id", idx)
        image = sample["image"]
        if hasattr(image, "shape"):
            # Array-like image: (height, width, ...)
            height, width = image.shape[:2]
        else:
            # PIL-style image: size is (width, height)
            width, height = image.size
        coco_images.append({"id": image_id, "width": int(width), "height": int(height)})

        for obj in _iter_objects(sample.get("objects")):
            x, y, w, h = (float(v) for v in obj["bbox"])
            area = obj.get("area")
            coco_annotations.append(
                {
                    "id": next_annotation_id,
                    "image_id": image_id,
                    "category_id": int(obj["category"]),
                    "bbox": [x, y, w, h],
                    # COCOeval reads "area" on every ground-truth box.
                    "area": float(area) if area is not None else w * h,
                    "iscrowd": int(obj.get("iscrowd") or 0),
                }
            )
            next_annotation_id += 1

    return coco_images, coco_annotations


def prepare_for_coco_detection(predictions):
    """Flatten per-image predictions into a COCO detection-results list.

    Args:
        predictions: Mapping ``image_id -> {"scores", "labels", "boxes"}``
            where each value supports ``.tolist()`` and boxes are
            ``[x_min, y_min, x_max, y_max]``.

    Returns:
        List of dicts with "image_id", "category_id", "bbox" (converted to
        ``[x, y, width, height]``) and "score".
    """
    coco_results = []
    for image_id, pred in predictions.items():
        boxes = pred["boxes"].tolist()
        scores = pred["scores"].tolist()
        labels = pred["labels"].tolist()
        for (x1, y1, x2, y2), score, label in zip(boxes, scores, labels):
            coco_results.append(
                {
                    "image_id": image_id,
                    "category_id": int(label),
                    "bbox": [x1, y1, x2 - x1, y2 - y1],
                    "score": float(score),
                }
            )
    return coco_results

## 6. Run Inference on Test Set

In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm


def collate_fn(batch):
    """Collate dataset rows into parallel lists of images and image ids.

    Returns:
        ``(images, image_ids)``. Images are converted to RGB so grayscale or
        RGBA pages go through the processor uniformly. An id is ``None`` when
        the row has no "id" field; the caller substitutes the row's dataset
        position, because a batch-local index would repeat in every batch.
    """
    images = [item["image"].convert("RGB") for item in batch]
    image_ids = [item.get("id") for item in batch]
    return images, image_ids


# Create dataloader (shuffle=False keeps batches in dataset order, which the
# positional id fallback below relies on)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=2,
)

print("Running inference on test set...")
predictions = {}
num_seen = 0  # dataset position of the first image in the current batch

model.eval()
with torch.no_grad():
    for images, image_ids in tqdm(dataloader, desc="Inference"):
        # Preprocess images
        inputs = processor(images=images, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run model
        outputs = model(**inputs)

        # Post-process predictions; PIL size is (width, height), the
        # processor expects (height, width) per image
        target_sizes = torch.tensor([img.size[::-1] for img in images]).to(device)
        results = processor.post_process_object_detection(
            outputs,
            threshold=0.0,  # Keep all predictions for mAP calculation
            target_sizes=target_sizes,
        )

        # Store predictions on the CPU so results for the whole split do not
        # accumulate in GPU memory
        for offset, (image_id, result) in enumerate(zip(image_ids, results)):
            if image_id is None:
                image_id = num_seen + offset
            predictions[image_id] = {key: value.cpu() for key, value in result.items()}

        num_seen += len(images)

print(f"✓ Generated predictions for {len(predictions)} images")

## 7. Prepare COCO Format and Calculate mAP

In [None]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import json

# Prepare ground truth in COCO format
print("Preparing ground truth annotations...")
coco_images, coco_annotations = prepare_ground_truth(dataset)

# Build the COCO category list: real class names when the dataset feature
# exposes them, otherwise generic names covering every id seen in the
# annotations.
category_feature = dataset.features["objects"].feature["category"]
if hasattr(category_feature, "names"):
    categories = [
        {"id": i, "name": name} for i, name in enumerate(category_feature.names)
    ]
else:
    # default=-1 keeps this from raising when there are no annotations at all
    num_categories = max((ann["category_id"] for ann in coco_annotations), default=-1) + 1
    categories = [
        {"id": i, "name": f"class_{i}"} for i in range(num_categories)
    ]

# Create COCO ground truth object
coco_gt = COCO()
coco_gt.dataset = {
    # NOTE(review): some pycocotools builds copy "info"/"licenses" inside
    # loadRes() and raise KeyError when they are absent; empty placeholders
    # are harmless for the others.
    "info": {},
    "licenses": [],
    "images": coco_images,
    "annotations": coco_annotations,
    "categories": categories,
}
coco_gt.createIndex()

print(f"✓ Ground truth: {len(coco_images)} images, {len(coco_annotations)} annotations")

# Prepare predictions in COCO format
print("Preparing predictions...")
coco_results = prepare_for_coco_detection(predictions)
print(f"✓ Predictions: {len(coco_results)} detections")

# Run COCO evaluation. Both names are pre-defined so that they exist even
# when there is nothing to evaluate.
coco_dt = None
coco_eval = None
if len(coco_results) == 0:
    print("⚠️  No predictions generated! Model may not be detecting any objects.")
else:
    print("\nCalculating COCO metrics...")
    coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

## 8. Per-Class Average Precision

In [None]:
if len(coco_results) > 0:
    print("\n" + "=" * 80)
    print("Per-Class Average Precision (AP @ IoU 0.5:0.95)")
    print("=" * 80)

    # Per-class AP is read from the precision array that accumulate() stored
    # on the overall evaluation. COCOeval.stats cannot be used for this: it is
    # only filled in by summarize(), so it is empty after evaluate() and
    # accumulate() alone.
    per_class_ap = {}
    id_to_name = {cat["id"]: cat["name"] for cat in categories}

    # precision has shape [iou_thresholds, recall_thresholds, categories,
    # area_ranges, max_detections]
    precision = coco_eval.eval["precision"]
    area_idx = coco_eval.params.areaRngLbl.index("all")
    max_det_idx = len(coco_eval.params.maxDets) - 1

    for k, cat_id in enumerate(coco_eval.params.catIds):
        cat_id = int(cat_id)
        cat_name = id_to_name.get(cat_id, f"class_{cat_id}")

        # Entries of -1 mark settings with no ground truth; leave them out of
        # the mean, and report -1 when nothing is left (pycocotools' convention).
        cat_precision = precision[:, :, k, area_idx, max_det_idx]
        valid = cat_precision[cat_precision > -1]
        ap = float(valid.mean()) if valid.size else -1.0
        per_class_ap[cat_name] = ap

        print(f"  {cat_name:30s}: {ap:.4f}")

    print("=" * 80)

## 9. Summary Statistics

In [None]:
# Print a run overview followed by the twelve COCO summary statistics,
# or a warning when there were no detections to score.
if len(coco_results) == 0:
    print("\n⚠️  Cannot calculate metrics - no predictions generated")
else:
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("EVALUATION SUMMARY")
    print(heavy_rule)

    # What was evaluated
    print(f"\nModel: {MODEL_ID}")
    print(f"Dataset: {DATASET_NAME} ({TEST_SPLIT} split)")
    print(f"Number of test samples: {len(dataset)}")
    print(f"Number of ground truth annotations: {len(coco_annotations)}")
    print(f"Number of predictions: {len(coco_results)}")

    print("\n" + light_rule)
    print("COCO Metrics:")
    print(light_rule)

    # Labels are listed in the same order as the entries of coco_eval.stats.
    stat_labels = (
        "Average Precision (AP) @ IoU=0.50:0.95",
        "Average Precision (AP) @ IoU=0.50",
        "Average Precision (AP) @ IoU=0.75",
        "Average Precision (AP) @ IoU=0.50:0.95 (small)",
        "Average Precision (AP) @ IoU=0.50:0.95 (medium)",
        "Average Precision (AP) @ IoU=0.50:0.95 (large)",
        "Average Recall (AR) @ IoU=0.50:0.95 (max 1 det)",
        "Average Recall (AR) @ IoU=0.50:0.95 (max 10 det)",
        "Average Recall (AR) @ IoU=0.50:0.95 (max 100 det)",
        "Average Recall (AR) @ IoU=0.50:0.95 (small)",
        "Average Recall (AR) @ IoU=0.50:0.95 (medium)",
        "Average Recall (AR) @ IoU=0.50:0.95 (large)",
    )
    for label, stat in zip(stat_labels, coco_eval.stats):
        print(f"{label:50s}: {stat:.4f}")

    print("\n" + heavy_rule)
    print("\n✓ Evaluation complete!")

## 10. Visualize Sample Predictions (Optional)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import random

def visualize_predictions(dataset, predictions, num_samples=3, score_threshold=0.5):
    """Draw predicted boxes on randomly chosen dataset images.

    Args:
        dataset: Indexable dataset whose rows have an "image" entry and an
            optional "id".
        predictions: Mapping ``image_id -> {"scores", "labels", "boxes"}``
            with boxes as ``[x_min, y_min, x_max, y_max]``.
        num_samples: Number of images to show (capped at the dataset size).
        score_threshold: Only detections scoring strictly above this value
            are drawn.

    Reads the notebook-level ``categories`` list to turn label ids into names.
    """
    # Look names up by category id; unknown ids get a generic name instead of
    # raising IndexError.
    category_names = {cat["id"]: cat["name"] for cat in categories}
    indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))

    for idx in indices:
        sample = dataset[idx]
        image_id = sample.get("id", idx)
        image = sample["image"]

        # Get predictions for this image and filter by score threshold.
        # Plain lists are used from here on, so an image without predictions
        # needs no tensor library at all.
        pred = predictions.get(image_id)
        if pred:
            keep = pred["scores"] > score_threshold
            boxes = pred["boxes"][keep].tolist()
            labels = pred["labels"][keep].tolist()
            scores = pred["scores"][keep].tolist()
        else:
            boxes, labels, scores = [], [], []

        # Create plot
        fig, ax = plt.subplots(1, figsize=(12, 8))
        ax.imshow(image)

        # Draw predictions
        for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
            # Draw box
            rect = patches.Rectangle(
                (x1, y1), x2 - x1, y2 - y1,
                linewidth=2, edgecolor='red', facecolor='none'
            )
            ax.add_patch(rect)

            # Add label
            name = category_names.get(label, f"class_{label}")
            ax.text(
                x1, y1 - 5, f"{name}: {score:.2f}",
                color='white', fontsize=10,
                bbox=dict(facecolor='red', alpha=0.7)
            )

        ax.axis('off')
        ax.set_title(f"Image ID: {image_id} | Predictions: {len(boxes)}")
        fig.tight_layout()
        plt.show()
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)

# Show a few annotated samples, unless the model produced no detections.
if not coco_results:
    print("No predictions to visualize")
else:
    print("Visualizing sample predictions (red boxes = predictions with score > 0.5)...\n")
    visualize_predictions(
        dataset,
        predictions,
        num_samples=3,
        score_threshold=0.5,
    )

## 11. Export Results (Optional)

In [None]:
# Save results to JSON file
if len(coco_results) > 0:
    results_dict = {
        "model_id": MODEL_ID,
        "dataset": DATASET_NAME,
        "split": TEST_SPLIT,
        "num_samples": len(dataset),
        "metrics": {
            "mAP_50_95": float(coco_eval.stats[0]),
            "mAP_50": float(coco_eval.stats[1]),
            "mAP_75": float(coco_eval.stats[2]),
            "mAP_small": float(coco_eval.stats[3]),
            "mAP_medium": float(coco_eval.stats[4]),
            "mAP_large": float(coco_eval.stats[5]),
        },
        # Coerce to built-in floats so every value is JSON-serializable
        # regardless of the numeric type stored in per_class_ap.
        "per_class_ap": {name: float(ap) for name, ap in per_class_ap.items()},
    }

    with open("evaluation_results.json", "w") as f:
        json.dump(results_dict, f, indent=2)

    print("✓ Results saved to evaluation_results.json")

    # Offer a browser download when running on Colab. Elsewhere the file is
    # already on disk, so a missing google.colab module is not an error.
    try:
        from google.colab import files
    except ImportError:
        print("Not running on Colab - skipping browser download")
    else:
        files.download("evaluation_results.json")