In [1]:
import json
import pickle
import shutil
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor


def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that looks like the repo root.

    A directory qualifies when it contains both a ``data_ingestion`` and a
    ``utils`` entry. The search begins at the resolved ``start`` and moves
    towards the filesystem root; the first qualifying directory wins.

    Args:
        start: Directory to begin searching from (resolved before use).

    Returns:
        The first qualifying directory, or the resolved ``start`` itself when
        no directory on the way up qualifies.
    """
    origin = start.resolve()
    markers = ("data_ingestion", "utils")
    matches = (
        directory
        for directory in (origin, *origin.parents)
        if all((directory / marker).exists() for marker in markers)
    )
    # Fall back to the starting directory when no ancestor carries both markers.
    return next(matches, origin)


# Detect the repository root from the working directory and make sure it is on
# sys.path, so the project-local `utils` package below can be imported.
ROOT = find_repo_root(Path("."))
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path.insert(0, str(ROOT))

from utils.paths import CACHE_PATH  # noqa: E402

# Raw CIFAR-10 batches are read from the project cache; artifacts produced by
# this notebook are written below <ROOT>/outputs/clip_multimodal.
CACHE = CACHE_PATH
RAW_DIR = CACHE / "clip_multimodal" / "raw" / "cifar-10-batches-py"
OUTPUTS_DIR = ROOT.joinpath("outputs", "clip_multimodal")
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Use CUDA when torch reports it as available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

  from .autonotebook import tqdm as notebook_tqdm


'cpu'

In [19]:
if not RAW_DIR.exists():
    print("CIFAR-10 not found in cache; running ingestion...")
    # This populates: .cache/clip_multimodal/raw/cifar-10-batches-py
    # and writes: .cache/clip_multimodal/label_texts.json
    !python data_ingestion/clip_multimodal/run.py

assert RAW_DIR.exists(), f"Missing: {RAW_DIR}"

label_texts_path = CACHE / "clip_multimodal" / "label_texts.json"
if label_texts_path.exists():
    obj = json.loads(label_texts_path.read_text(encoding="utf-8"))
    # In this project we store either a list of strings or a dict like {"labels": [...]}
    if isinstance(obj, list):
        label_list = obj
    elif isinstance(obj, dict) and "labels" in obj:
        label_list = obj["labels"]
    elif isinstance(obj, dict) and "label_texts" in obj:
        label_list = obj["label_texts"]
    else:
        raise ValueError(f"Unexpected label_texts.json format: {type(obj)} keys={list(obj) if isinstance(obj, dict) else None}")
else:
    # Fallback (CIFAR-10 class names)
    label_list = [
        "airplane","automobile","bird","cat","deer",
        "dog","frog","horse","ship","truck",
    ]

assert isinstance(label_list, list) and all(isinstance(x, str) for x in label_list)
labels = label_list
labels

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [20]:
# Load CIFAR-10 test split (python batches)
test_batch_path = RAW_DIR / "test_batch"
# NOTE(security): unpickling can execute arbitrary code embedded in the file.
# Only run this against a batch file obtained from a trusted source.
with open(test_batch_path, "rb") as f:
    # encoding="bytes" leaves the dictionary keys as bytes (hence b"data" below).
    batch = pickle.load(f, encoding="bytes")

# images: (N, 32, 32, 3), uint8
# Rows are viewed as channel-first (3, 32, 32) and then moved to channel-last.
images = batch[b"data"].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
y_true = np.array(batch[b"labels"], dtype=np.int64)

# Fail fast on a truncated or mismatched batch instead of hitting an opaque
# error later during preprocessing or label lookup.
assert len(images) == len(y_true), (
    f"image/label count mismatch: {len(images)} images vs {len(y_true)} labels"
)
assert images.dtype == np.uint8, f"expected uint8 pixels, got {images.dtype}"
assert y_true.size == 0 or (y_true.min() >= 0 and y_true.max() < len(labels)), (
    f"label ids fall outside the {len(labels)} known class names"
)

len(images), images.shape, y_true[:10]

  batch = pickle.load(f, encoding="bytes")


(10000, (10000, 32, 32, 3), array([3, 8, 8, 0, 6, 6, 1, 6, 3, 1]))

In [21]:
# Load the pretrained CLIP checkpoint and its matching preprocessor.
model_id = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_id)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(model_id)

# Build one text prompt per entry of `labels` and tokenize them as a batch.
texts = [f"a photo of a {lbl}" for lbl in labels]
text_inputs = {
    name: tensor.to(device)
    for name, tensor in processor(
        text=texts, return_tensors="pt", padding=True, truncation=True
    ).items()
}

# Encode the prompts: text tower -> pooled output -> projection layer.
with torch.no_grad():
    text_out = model.text_model(
        input_ids=text_inputs["input_ids"],
        attention_mask=text_inputs.get("attention_mask"),
    )
    pooled_text = text_out.pooler_output
    text_features = model.text_projection(pooled_text)

# Scale every row to unit L2 norm.
text_norms = text_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_norms

Loading weights: 100%|██████████| 398/398 [00:00<00:00, 476.87it/s, Materializing param=visual_projection.weight]                                
CLIPModel LOAD REPORT from: openai/clip-vit-base-patch32
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
vision_model.embeddings.position_ids | UNEXPECTED |  | 
text_model.embeddings.position_ids   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [22]:
# Zero-shot evaluation: embed images in batches, compare each embedding with
# the precomputed text embeddings, and take the best-matching class.
N = 10_000  # set 10_000 for full CIFAR-10 test; lower for quick checks
batch_size = 64
log_every = 5  # batches

n = min(N, len(images))
# An empty (or negative) evaluation size would yield a NaN accuracy or a
# silently wrong slice below, so stop here with a clear message.
assert n > 0, f"Nothing to evaluate: N={N}, available images={len(images)}"
preds: list[int] = []

t0 = time.perf_counter()
n_batches = (n + batch_size - 1) // batch_size  # ceiling division

for batch_i, start in enumerate(range(0, n, batch_size), start=1):
    end = min(start + batch_size, n)
    # The processor is fed PIL images, so convert each array in the batch.
    batch_imgs = [Image.fromarray(images[i]) for i in range(start, end)]

    img_inputs = processor(images=batch_imgs, return_tensors="pt")
    pixel_values = img_inputs["pixel_values"].to(device)

    with torch.no_grad():
        vision_out = model.vision_model(pixel_values=pixel_values)
        img_features = model.visual_projection(vision_out.pooler_output)

    # Unit-normalize, then score every image against every text embedding.
    img_features = img_features / img_features.norm(dim=-1, keepdim=True)
    logits = img_features @ text_features.T
    preds.extend(torch.argmax(logits, dim=-1).detach().cpu().tolist())

    # Periodic progress line; always emitted for the final batch.
    if batch_i % log_every == 0 or end == n:
        elapsed = time.perf_counter() - t0
        done = end
        rate = done / elapsed if elapsed > 0 else float("inf")
        remaining = (n - done) / rate if rate > 0 else float("inf")
        print(f"[{batch_i}/{n_batches}] {done}/{n} images | {rate:.1f} img/s | ETA {remaining/60:.1f} min")

y_pred = np.array(preds, dtype=np.int64)
# Compare against the first n ground-truth labels only (n may be < len(y_true)).
acc = float((y_pred == y_true[:n]).mean())
total_s = time.perf_counter() - t0
print(f"Done: acc={acc:.4f} | eval_images={n} | time={total_s/60:.2f} min")
acc

[5/157] 320/10000 images | 12.7 img/s | ETA 12.7 min
[10/157] 640/10000 images | 12.1 img/s | ETA 12.9 min
[15/157] 960/10000 images | 11.7 img/s | ETA 12.9 min
[20/157] 1280/10000 images | 11.7 img/s | ETA 12.4 min
[25/157] 1600/10000 images | 11.4 img/s | ETA 12.3 min
[30/157] 1920/10000 images | 11.5 img/s | ETA 11.7 min
[35/157] 2240/10000 images | 11.5 img/s | ETA 11.2 min
[40/157] 2560/10000 images | 11.6 img/s | ETA 10.7 min
[45/157] 2880/10000 images | 11.6 img/s | ETA 10.2 min
[50/157] 3200/10000 images | 11.6 img/s | ETA 9.8 min
[55/157] 3520/10000 images | 11.6 img/s | ETA 9.3 min
[60/157] 3840/10000 images | 11.7 img/s | ETA 8.8 min
[65/157] 4160/10000 images | 11.7 img/s | ETA 8.3 min
[70/157] 4480/10000 images | 11.7 img/s | ETA 7.9 min
[75/157] 4800/10000 images | 11.7 img/s | ETA 7.4 min
[80/157] 5120/10000 images | 11.6 img/s | ETA 7.0 min
[85/157] 5440/10000 images | 11.5 img/s | ETA 6.6 min
[90/157] 5760/10000 images | 11.5 img/s | ETA 6.1 min
[95/157] 6080/10000 ima

0.888

In [24]:
# Everything produced by this cell lands in <OUTPUTS_DIR>/evaluation.
eval_dir = OUTPUTS_DIR / "evaluation"
eval_dir.mkdir(parents=True, exist_ok=True)

# Headline numbers plus the settings needed to interpret them.
metrics = {
    "pipeline": "clip_multimodal",
    "model_id": model_id,
    "num_eval": int(n),
    "top1_accuracy": acc,
    "labels": labels,
    "prompt_template": "a photo of a {label}",
}
metrics_path = eval_dir / "metrics.json"
with metrics_path.open("w", encoding="utf-8") as fh:
    fh.write(json.dumps(metrics, indent=2))

# Small per-image sample (at most the first 50 evaluated images) for eyeballing.
sample_size = min(50, n)
rows = [
    {
        "idx": i,
        "true_id": int(y_true[i]),
        "true_label": labels[int(y_true[i])],
        "pred_id": int(y_pred[i]),
        "pred_label": labels[int(y_pred[i])],
    }
    for i in range(sample_size)
]

report_df = pd.DataFrame(rows)
report_csv = eval_dir / "sample_predictions.csv"
report_df.to_csv(report_csv, index=False)

metrics_path, report_csv

(PosixPath('/home/mateusz/dev/pjatk_zum/outputs/clip_multimodal/evaluation/metrics.json'),
 PosixPath('/home/mateusz/dev/pjatk_zum/outputs/clip_multimodal/evaluation/sample_predictions.csv'))

In [None]:
# (Optional) Zip outputs for sharing (e.g., Colab -> download)
# Built with shutil rather than the `zip` command line tool: no external binary
# is required, and the folder to archive is located through ROOT instead of the
# current working directory. Entries keep the `outputs/clip_multimodal/...`
# prefix. The archive is written to the working directory as
# clip_multimodal_outputs.zip; an existing file of that name is replaced.
archive_path = shutil.make_archive(
    "clip_multimodal_outputs",
    "zip",
    root_dir=str(ROOT),
    base_dir=str(OUTPUTS_DIR.relative_to(ROOT)),
)
archive_path