In [1]:
# Colab convenience: clone repo if needed
from pathlib import Path

if not Path("pyproject.toml").exists():
    if not Path("pjatk_zum").exists():
        !git clone https://github.com/beep1000101/pjatk_zum.git
    else:
        print("Repo folder already present: pjatk_zum")
else:
    print("Already in repo root (pyproject.toml found)")

# Colab convenience: cd into repo folder if we cloned it
from pathlib import Path

if Path("pyproject.toml").exists():
    print("Already in repo root")
elif Path("pjatk_zum").exists():
    %cd pjatk_zum
else:
    raise FileNotFoundError("Could not find repo root (pyproject.toml) or ./pjatk_zum")

Cloning into 'pjatk_zum'...
remote: Enumerating objects: 266, done.[K
remote: Counting objects: 100% (266/266), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 266 (delta 107), reused 238 (delta 85), pack-reused 0 (from 0)[K
Receiving objects: 100% (266/266), 189.85 KiB | 2.88 MiB/s, done.
Resolving deltas: 100% (107/107), done.
/content/pjatk_zum


In [2]:
from pathlib import Path
import sys
import json
import pickle

import numpy as np
import pandas as pd

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor


def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` that looks like the repo root.

    A directory qualifies when it contains both a ``data_ingestion`` and a
    ``utils`` entry. The search begins at the resolved ``start`` and walks
    upward through its parents.

    Args:
        start: Path to begin the search from (resolved before searching).

    Returns:
        The first qualifying directory, or the resolved ``start`` itself when
        no directory on the way up qualifies.
    """
    origin = start.resolve()
    required_entries = ("data_ingestion", "utils")
    candidates = (origin, *origin.parents)
    return next(
        (
            directory
            for directory in candidates
            if all((directory / entry).exists() for entry in required_entries)
        ),
        origin,  # fallback: nothing matched, keep the starting point
    )


# Locate the checkout and put it on sys.path so project packages can be imported.
ROOT = find_repo_root(Path())
root_entry = str(ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from utils.paths import CACHE_PATH  # noqa: E402

# Cache-side inputs and repo-side outputs used by this notebook.
CACHE = CACHE_PATH
RAW_DIR = CACHE / "clip_multimodal" / "raw" / "cifar-10-batches-py"
OUTPUTS_DIR = ROOT.joinpath("outputs", "clip_multimodal")
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
if not RAW_DIR.exists():
    print("CIFAR-10 not found in cache; running ingestion...")
    # This populates: .cache/clip_multimodal/raw/cifar-10-batches-py
    # and writes: .cache/clip_multimodal/label_texts.json
    !python data_ingestion/clip_multimodal/run.py

assert RAW_DIR.exists(), f"Missing: {RAW_DIR}"

label_texts_path = CACHE / "clip_multimodal" / "label_texts.json"
if label_texts_path.exists():
    obj = json.loads(label_texts_path.read_text(encoding="utf-8"))
    # In this project we store either a list of strings or a dict like {"labels": [...]}
    if isinstance(obj, list):
        label_list = obj
    elif isinstance(obj, dict) and "labels" in obj:
        label_list = obj["labels"]
    elif isinstance(obj, dict) and "label_texts" in obj:
        label_list = obj["label_texts"]
    else:
        raise ValueError(f"Unexpected label_texts.json format: {type(obj)} keys={list(obj) if isinstance(obj, dict) else None}")
else:
    # Fallback (CIFAR-10 class names)
    label_list = [
        "airplane","automobile","bird","cat","deer",
        "dog","frog","horse","ship","truck",
    ]

assert isinstance(label_list, list) and all(isinstance(x, str) for x in label_list)
labels = label_list
labels

CIFAR-10 not found in cache; running ingestion...
[clip_multimodal] Downloading: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[clip_multimodal] Cache file: .cache/clip_multimodal/cifar-10-python.tar.gz
[clip_multimodal] Extracting into: .cache/clip_multimodal/raw
Wrote provenance: .cache/clip_multimodal/provenance.json


['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [4]:
# Read the CIFAR-10 test split from its pickled python batch file.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads; the batch is
# expected to come from the ingestion step's download — confirm that source is trusted.
test_batch_path = RAW_DIR / "test_batch"
with test_batch_path.open("rb") as fh:
    batch = pickle.load(fh, encoding="bytes")

# Each flat row is viewed as (3, 32, 32), then the channel axis is moved last,
# giving images of shape (N, 32, 32, 3) (uint8 per the original note — TODO confirm dtype).
flat_pixels = batch[b"data"]
channel_first = flat_pixels.reshape(-1, 3, 32, 32)
images = channel_first.transpose(0, 2, 3, 1)
y_true = np.array(batch[b"labels"], dtype=np.int64)

len(images), images.shape, y_true[:10]

(10000, (10000, 32, 32, 3), array([3, 8, 8, 0, 6, 6, 1, 6, 3, 1]))

In [5]:
# Zero-shot setup: load CLIP and embed one text prompt per class label.
model_id = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_id)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(model_id)

# One prompt per label, tokenized as a padded batch and moved to the model's device.
texts = ["a photo of a {}".format(lbl) for lbl in labels]
text_inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True)
text_inputs = {name: tensor.to(device) for name, tensor in text_inputs.items()}

# Text tower + projection, without tracking gradients.
with torch.no_grad():
    text_out = model.text_model(
        input_ids=text_inputs["input_ids"],
        attention_mask=text_inputs.get("attention_mask"),
    )
    text_features = model.text_projection(text_out.pooler_output)

# Scale every embedding to unit length so a dot product acts as cosine similarity.
text_norms = text_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_norms

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [6]:
import time

# Evaluation knobs.
N = 10_000  # set 10_000 for full CIFAR-10 test; lower for quick checks
batch_size = 64
log_every = 5  # batches

# Never evaluate more images than are actually loaded.
n = min(N, len(images))
preds: list[int] = []

t0 = time.perf_counter()
n_batches = (n + batch_size - 1) // batch_size  # ceil(n / batch_size)

for batch_i, start in enumerate(range(0, n, batch_size), start=1):
    end = min(start + batch_size, n)
    # The processor is fed PIL images, so wrap each array slice.
    batch_imgs = [Image.fromarray(images[i]) for i in range(start, end)]

    img_inputs = processor(images=batch_imgs, return_tensors="pt")
    pixel_values = img_inputs["pixel_values"].to(device)

    # Keep the whole forward pass, normalisation and scoring out of autograd.
    with torch.no_grad():
        vision_out = model.vision_model(pixel_values=pixel_values)
        img_features = model.visual_projection(vision_out.pooler_output)
        # Unit-length image embeddings against unit-length text embeddings:
        # the matrix product is the cosine similarity to every class prompt.
        img_features = img_features / img_features.norm(dim=-1, keepdim=True)
        logits = img_features @ text_features.T

    # Predicted class = index of the most similar prompt.
    preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    # Progress line every `log_every` batches and once more after the final batch.
    if batch_i % log_every == 0 or end == n:
        elapsed = time.perf_counter() - t0
        done = end
        rate = done / elapsed if elapsed > 0 else float("inf")
        remaining = (n - done) / rate if rate > 0 else float("inf")
        print(f"[{batch_i}/{n_batches}] {done}/{n} images | {rate:.1f} img/s | ETA {remaining/60:.1f} min")

y_pred = np.array(preds, dtype=np.int64)
# Compare against the first n labels only, matching the evaluated subset.
acc = float((y_pred == y_true[:n]).mean())
total_s = time.perf_counter() - t0
print(f"Done: acc={acc:.4f} | eval_images={n} | time={total_s/60:.2f} min")
acc

[5/157] 320/10000 images | 206.4 img/s | ETA 0.8 min
[10/157] 640/10000 images | 255.2 img/s | ETA 0.6 min
[15/157] 960/10000 images | 276.5 img/s | ETA 0.5 min
[20/157] 1280/10000 images | 287.5 img/s | ETA 0.5 min
[25/157] 1600/10000 images | 296.1 img/s | ETA 0.5 min
[30/157] 1920/10000 images | 302.1 img/s | ETA 0.4 min
[35/157] 2240/10000 images | 306.4 img/s | ETA 0.4 min
[40/157] 2560/10000 images | 309.8 img/s | ETA 0.4 min
[45/157] 2880/10000 images | 312.4 img/s | ETA 0.4 min
[50/157] 3200/10000 images | 314.6 img/s | ETA 0.4 min
[55/157] 3520/10000 images | 316.3 img/s | ETA 0.3 min
[60/157] 3840/10000 images | 317.9 img/s | ETA 0.3 min
[65/157] 4160/10000 images | 319.4 img/s | ETA 0.3 min
[70/157] 4480/10000 images | 320.4 img/s | ETA 0.3 min
[75/157] 4800/10000 images | 321.3 img/s | ETA 0.3 min
[80/157] 5120/10000 images | 322.0 img/s | ETA 0.3 min
[85/157] 5440/10000 images | 322.7 img/s | ETA 0.2 min
[90/157] 5760/10000 images | 323.3 img/s | ETA 0.2 min
[95/157] 6080/

0.8879

In [7]:
# Persist the headline metrics plus a small table of per-image predictions.
evaluation_dir = OUTPUTS_DIR / "evaluation"
evaluation_dir.mkdir(parents=True, exist_ok=True)

# Summary of this evaluation run, written as pretty-printed JSON.
metrics = dict(
    pipeline="clip_multimodal",
    model_id=model_id,
    num_eval=int(n),
    top1_accuracy=acc,
    labels=labels,
    prompt_template="a photo of a {label}",
)
metrics_path = evaluation_dir / "metrics.json"
metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

# True vs. predicted class for the first (at most) 50 evaluated images.
sample_count = min(50, n)
rows = [
    {
        "idx": idx,
        "true_id": int(true_id),
        "true_label": labels[int(true_id)],
        "pred_id": int(pred_id),
        "pred_label": labels[int(pred_id)],
    }
    for idx, (true_id, pred_id) in enumerate(
        zip(y_true[:sample_count], y_pred[:sample_count])
    )
]

report_df = pd.DataFrame(rows)
report_csv = evaluation_dir / "sample_predictions.csv"
report_df.to_csv(report_csv, index=False)

metrics_path, report_csv

(PosixPath('/content/pjatk_zum/outputs/clip_multimodal/evaluation/metrics.json'),
 PosixPath('/content/pjatk_zum/outputs/clip_multimodal/evaluation/sample_predictions.csv'))

In [8]:
# (Optional) Zip outputs for sharing (e.g., Colab -> download)
# Recursively archives outputs/clip_multimodal into clip_multimodal_outputs.zip.
# Both paths are relative, so the command acts on the current working directory:
# run it from the directory that contains `outputs/`.
!zip -r clip_multimodal_outputs.zip outputs/clip_multimodal

  adding: outputs/clip_multimodal/ (stored 0%)
  adding: outputs/clip_multimodal/evaluation/ (stored 0%)
  adding: outputs/clip_multimodal/evaluation/metrics.json (deflated 39%)
  adding: outputs/clip_multimodal/evaluation/sample_predictions.csv (deflated 73%)


In [11]:
# --- Evaluation: Run after training ---
# Zero-shot evaluation through the project helpers. This cell still relies on `device` and
# `OUTPUTS_DIR` from the setup cell (and on the repo root being on sys.path), so run that first.
from notebooks.clip_multimodal.helpers import run_inference, compute_metrics, load_cifar10_test
import json
from transformers import CLIPModel, CLIPProcessor
import torch
from pathlib import Path

# Load model and processor
model_id = "openai/clip-vit-base-patch32"  # or your fine-tuned model path
model = CLIPModel.from_pretrained(model_id).to(device)
processor = CLIPProcessor.from_pretrained(model_id)

# Load test set
images, y_true = load_cifar10_test()
# CIFAR-10 class names; same values and order as the fallback list used when loading labels.
labels = ["airplane","automobile","bird","cat","deer","dog","frog","horse","ship","truck"]

# Prepare text features
# A single template drives the prompts and is also recorded with the metrics,
# so the saved description cannot drift from what was actually evaluated.
prompt_template = "a photo of a {label}"
texts = [prompt_template.format(label=lbl) for lbl in labels]
text_inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True)
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
with torch.no_grad():
    text_out = model.text_model(
        input_ids=text_inputs["input_ids"],
        attention_mask=text_inputs.get("attention_mask"),
    )
    text_features = model.text_projection(text_out.pooler_output)
# Unit-length text embeddings.
text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Run inference and compute metrics
y_pred = run_inference(model, processor, device, images, text_features)
metrics = compute_metrics(y_true, y_pred, labels)

# Save metrics
# This is the same file the manual evaluation cell writes. Keep its run metadata
# (pipeline, model, labels, prompt) instead of replacing the file with the bare metric
# values; keys returned by compute_metrics take precedence on any overlap.
outputs_dir = OUTPUTS_DIR
evaluation_dir = outputs_dir / "evaluation"
evaluation_dir.mkdir(parents=True, exist_ok=True)
saved_metrics = {
    "pipeline": "clip_multimodal",
    "model_id": model_id,
    "num_eval": len(y_pred),
    "labels": labels,
    "prompt_template": prompt_template,
    **metrics,
}
with open(evaluation_dir / "metrics.json", "w", encoding="utf-8") as f:
    json.dump(saved_metrics, f, indent=2)
metrics

[5/157] 320/10000 images | 332.4 img/s | ETA 0.5 min
[10/157] 640/10000 images | 332.1 img/s | ETA 0.5 min
[15/157] 960/10000 images | 332.1 img/s | ETA 0.5 min
[20/157] 1280/10000 images | 332.1 img/s | ETA 0.4 min
[25/157] 1600/10000 images | 331.8 img/s | ETA 0.4 min
[30/157] 1920/10000 images | 331.6 img/s | ETA 0.4 min
[35/157] 2240/10000 images | 331.2 img/s | ETA 0.4 min
[40/157] 2560/10000 images | 331.0 img/s | ETA 0.4 min
[45/157] 2880/10000 images | 331.1 img/s | ETA 0.4 min
[50/157] 3200/10000 images | 331.1 img/s | ETA 0.3 min
[55/157] 3520/10000 images | 331.2 img/s | ETA 0.3 min
[60/157] 3840/10000 images | 331.2 img/s | ETA 0.3 min
[65/157] 4160/10000 images | 331.3 img/s | ETA 0.3 min
[70/157] 4480/10000 images | 331.3 img/s | ETA 0.3 min
[75/157] 4800/10000 images | 331.3 img/s | ETA 0.3 min
[80/157] 5120/10000 images | 331.4 img/s | ETA 0.2 min
[85/157] 5440/10000 images | 331.4 img/s | ETA 0.2 min
[90/157] 5760/10000 images | 331.3 img/s | ETA 0.2 min
[95/157] 6080/

{'top1_accuracy': 0.8879}