In [None]:
# CLIP (multimodal) – evaluation notebook
# Goal: evaluate a pretrained CLIP model on CIFAR-10 from our cache.

from __future__ import annotations

from pathlib import Path
import sys
import json
import pickle

import numpy as np
import pandas as pd

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor


def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from ``start``.

    A directory counts as the root when it contains both a ``data_ingestion``
    and a ``utils`` entry. ``start`` is resolved first; it and then each of
    its parents are checked, nearest first.

    Args:
        start: Path at which the upward search begins.

    Returns:
        The first matching directory, or the resolved ``start`` when neither
        it nor any of its ancestors matches.
    """
    origin = start.resolve()
    markers = ("data_ingestion", "utils")
    candidates = (origin, *origin.parents)
    return next(
        (c for c in candidates if all((c / m).exists() for m in markers)),
        origin,  # nothing matched: fall back to where the search started
    )


# Resolve the repo root, searching upward from the current directory
# (Path() is "."; find_repo_root resolves it before walking the parents).
ROOT = find_repo_root(Path())
# Put the root on sys.path (once) so project-local packages can be imported.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Late import on purpose: presumably `utils` only resolves once ROOT is on
# sys.path, which is why this sits below the path tweak (hence the E402 waiver).
from utils.paths import CACHE_PATH  # noqa: E402

CACHE = CACHE_PATH
# Location inside the cache where this notebook expects the CIFAR-10 batches.
RAW_DIR = CACHE / "clip_multimodal" / "raw" / "cifar-10-batches-py"
# Evaluation artifacts are written under <ROOT>/outputs/clip_multimodal;
# create it up front so later cells can write without checking.
OUTPUTS_DIR = ROOT / "outputs" / "clip_multimodal"
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
# Ensure dataset is present in cache (runs ingestion if needed)
if not RAW_DIR.exists():
    print("CIFAR-10 not found in cache; running ingestion...")
    # This populates: .cache/clip_multimodal/raw/cifar-10-batches-py
    # and writes: .cache/clip_multimodal/label_texts.json
    !python data_ingestion/clip_multimodal/run.py

assert RAW_DIR.exists(), f"Missing: {RAW_DIR}"

label_texts_path = CACHE / "clip_multimodal" / "label_texts.json"
if label_texts_path.exists():
    labels = json.loads(label_texts_path.read_text(encoding="utf-8"))
else:
    # Fallback (CIFAR-10 class names)
    labels = [
        "airplane","automobile","bird","cat","deer",
        "dog","frog","horse","ship","truck",
    ]

labels

In [None]:
# Read the CIFAR-10 test split from the cached python-format batch file
test_batch_path = RAW_DIR / "test_batch"
# NOTE: unpickling can execute arbitrary code; only load a cache you trust.
with test_batch_path.open("rb") as fh:
    batch = pickle.load(fh, encoding="bytes")

# Each row of b"data" is one flattened image: view it as (3, 32, 32), then
# move the size-3 axis last so images end up as (N, 32, 32, 3).
# (uint8 is expected here — TODO confirm against the cached file.)
flat_pixels = batch[b"data"]
channels_first = flat_pixels.reshape(-1, 3, 32, 32)
images = channels_first.transpose(0, 2, 3, 1)
y_true = np.array(batch[b"labels"], dtype=np.int64)

len(images), images.shape, y_true[:10]

In [None]:
# Load pretrained CLIP and precompute text features
model_id = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_id).to(device)
processor = CLIPProcessor.from_pretrained(model_id)

# One prompt per class, in `labels` order, so row i of text_features
# corresponds to labels[i].
texts = [f"a photo of a {lbl}" for lbl in labels]
text_inputs = processor(text=texts, return_tensors="pt", padding=True).to(device)

# Inference only: without no_grad() the forward pass records an autograd
# graph and text_features would carry requires_grad=True into every later
# cell that uses it.
with torch.no_grad():
    text_features = model.get_text_features(**text_inputs)
    # Scale each row to unit L2 norm so a dot product against unit-norm
    # image features is a cosine similarity.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

model_id, text_features.shape

In [None]:
# Evaluate top-1 accuracy on a subset (prompt retrieval)
N = 1000  # increase to 10_000 for full CIFAR-10 test
batch_size = 64

# Never evaluate more samples than were loaded.
n = min(N, len(images))
preds: list[int] = []

# Inference only: no_grad() keeps autograd from recording the image forward
# pass for every batch, which would cost memory and time for nothing.
with torch.no_grad():
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)  # last batch may be short
        # The processor is fed PIL images, so wrap each array first.
        batch_imgs = [Image.fromarray(img) for img in images[start:end]]
        img_inputs = processor(images=batch_imgs, return_tensors="pt").to(device)
        img_features = model.get_image_features(**img_inputs)
        img_features = img_features / img_features.norm(dim=-1, keepdim=True)
        # Image and text features are both unit-norm, so this is the cosine
        # similarity of every image against every class prompt.
        logits = img_features @ text_features.T
        # Predicted class = index of the best-matching prompt.
        preds.extend(logits.argmax(dim=-1).cpu().tolist())

y_pred = np.array(preds, dtype=np.int64)
# Compare against the first n ground-truth labels only (the evaluated subset).
acc = float((y_pred == y_true[:n]).mean())
acc

In [None]:
# Persist the headline metrics and a short per-sample prediction table
eval_dir = OUTPUTS_DIR / "evaluation"
eval_dir.mkdir(parents=True, exist_ok=True)

metrics = {
    "pipeline": "clip_multimodal",
    "model_id": model_id,
    "num_eval": int(n),
    "top1_accuracy": acc,
    "labels": labels,
    "prompt_template": "a photo of a {label}",
}
metrics_path = eval_dir / "metrics.json"
metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

# One record per sample for the first 50 evaluated images (fewer if n < 50),
# with the numeric ids mapped back to their label text.
num_rows = min(50, n)
rows = [
    {
        "idx": i,
        "true_id": int(y_true[i]),
        "true_label": labels[int(y_true[i])],
        "pred_id": int(y_pred[i]),
        "pred_label": labels[int(y_pred[i])],
    }
    for i in range(num_rows)
]

report_csv = eval_dir / "sample_predictions.csv"
report_df = pd.DataFrame(rows)
report_df.to_csv(report_csv, index=False)

metrics_path, report_csv

In [None]:
# (Optional) Zip outputs for sharing (e.g., Colab -> download)
!zip -r clip_multimodal_outputs.zip outputs/clip_multimodal