In [1]:
import torch, numpy as np
from PIL import Image
from tqdm import tqdm
from torchvision.transforms import ToPILImage

# make sure Python sees your local modelvshuman package
import sys
sys.path.append(".")

# imports from your code
from modelvshuman.models.wrappers.pytorch import HyCoCLIPModel
from modelvshuman.utils import load_dataset
from hycoclip.lorentz import pairwise_dist
from modelvshuman.models.pytorch.clip.imagenet_classes import imagenet_classes


  from .autonotebook import tqdm as notebook_tqdm







In [2]:
# Adjust args to match your factory signature
model = HyCoCLIPModel("hycoclip", "cue-conflict")
# if your wrapper doesn’t auto-load, do:
# ckpt = torch.load("path/to/hycoclip_vit_s.pth", map_location="cpu", weights_only=False)
# model.model.load_state_dict(ckpt.get("model", ckpt), strict=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.model.to(device)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[1] checkpoint load: 84,315,215/85,290,372 nonzero (98.86% of params)
[DEBUG] zeroshot_weights: torch.Size([512, 1000]) min/max: -0.2458825409412384 0.24609914422035217 mean: 0.0002346319961361587
    first 10 norms: [1.0, 0.9999999403953552, 0.9999999403953552, 0.9999999403953552, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9999999403953552]


HyCoCLIP(
  (visual): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (patch_drop): Identity()
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (q_norm): Identity()
          (k_norm): Identity()
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
  

In [3]:
ds = load_dataset("cue-conflict", split="val")
loader = ds.loader  # DataLoader instance

In [4]:
# [1] checkpoint parameters
params  = list(model.model.parameters())
tot     = sum(p.numel() for p in params)
nz      = sum((p.abs()>1e-6).sum().item() for p in params)
print("loaded params:", nz, "/", tot, f"({100*nz/tot:.1f}%)")

# [2] zeroshot weights
zs = model.zeroshot_weights
print("zeroshot_weights:", zs.shape,
      "min/max/mean:", zs.min(), zs.max(), zs.mean())
print("first norms:", zs.norm(dim=0)[:10])

loaded params: 84315215 / 85290372 (98.9%)
zeroshot_weights: torch.Size([512, 1000]) min/max/mean: tensor(-0.2459, device='cuda:0') tensor(0.2461, device='cuda:0') tensor(0.0002, device='cuda:0')
first norms: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000], device='cuda:0')


In [5]:
# grab one batch
imgs, targets, _ = next(iter(loader))
print("raw:", imgs.shape, imgs.min(), imgs.max())

proc = model.preprocess()
pimgs = [proc(ToPILImage()(im)) for im in imgs[:4]]
batch = torch.stack(pimgs)
print("pre:", batch.shape, batch.min(), batch.max())

raw: torch.Size([16, 3, 224, 224]) tensor(-2.1179) tensor(2.6400)
pre: torch.Size([4, 3, 224, 224]) tensor(-2.1179) tensor(2.6400)


In [6]:
logits = model.forward_batch(imgs[:4].to(device))
print("logits:", logits.shape,
      "min/max/mean:", logits.min(), logits.max(), logits.mean())

top5 = np.argsort(-logits, axis=1)[:4,:5]
for i in range(4):
    print(f"sample {i} top5:", top5[i])

[DEBUG] raw batch range: -2.1179039478302 2.640000104904175
[DEBUG] preprocessed shape: torch.Size([4, 3, 224, 224]) range: -2.1179039478302 2.640000104904175
[DEBUG] logits shape: torch.Size([4, 1000]) min/max: tensor(-113.2486, device='cuda:0') tensor(-75.9184, device='cuda:0')
logits: (4, 1000) min/max/mean: -113.24858 -75.91841 -96.67283
sample 0 top5: [489 632 545 490 999]
sample 1 top5: [920 781 566 727 645]
sample 2 top5: [919 489 475 920 727]
sample 3 top5: [489 490 581 919 545]


In [8]:
print(imagenet_classes)

['tench', 'goldfish', 'great white shark', 'tiger shark', 'hammerhead shark', 'electric ray', 'stingray', 'rooster', 'hen', 'ostrich', 'brambling', 'goldfinch', 'house finch', 'junco', 'indigo bunting', 'American robin', 'bulbul', 'jay', 'magpie', 'chickadee', 'American dipper', 'kite (bird of prey)', 'bald eagle', 'vulture', 'great grey owl', 'fire salamander', 'smooth newt', 'newt', 'spotted salamander', 'axolotl', 'American bullfrog', 'tree frog', 'tailed frog', 'loggerhead sea turtle', 'leatherback sea turtle', 'mud turtle', 'terrapin', 'box turtle', 'banded gecko', 'green iguana', 'Carolina anole', 'desert grassland whiptail lizard', 'agama', 'frilled-necked lizard', 'alligator lizard', 'Gila monster', 'European green lizard', 'chameleon', 'Komodo dragon', 'Nile crocodile', 'American alligator', 'triceratops', 'worm snake', 'ring-necked snake', 'eastern hog-nosed snake', 'smooth green snake', 'kingsnake', 'garter snake', 'water snake', 'vine snake', 'night snake', 'boa constrictor

In [10]:
probs = np.zeros((1, logits.shape[1]), dtype=np.float32)
idx   = imagenet_classes.index("hen")
probs[0, idx] = 1.0
print("mapped hen->", ds.decision_mapping(probs)[0])

mapped hen-> ['bird' 'truck' 'knife' 'oven' 'elephant' 'dog' 'clock' 'keyboard' 'chair'
 'cat' 'bottle' 'car' 'boat' 'bicycle' 'bear' 'airplane']


In [11]:
print("=== ImageNet top-5 per sample ===")
for i in range(min(4, logits.shape[0])):
    top5_idx = np.argsort(-logits[i])[:5]
    top5_names = [imagenet_classes[j] for j in top5_idx]
    print(f"sample {i:2d}:", top5_names)

=== ImageNet top-5 per sample ===
sample  0: ['chain-link fence', 'music speaker', 'electric fan', 'chain mail', 'toilet paper']
sample  1: ['traffic light', 'scoreboard', 'French horn', 'planetarium', 'maypole']
sample  2: ['traffic or street sign', 'chain-link fence', 'car mirror', 'traffic light', 'planetarium']
sample  3: ['chain-link fence', 'chain mail', 'radiator grille', 'traffic or street sign', 'electric fan']


In [12]:
mapped = ds.decision_mapping(logits)  # → array of shape [B]
print("\n=== Mapped cue-conflict predictions ===")
for i in range(min(4, len(mapped))):
    label_idx  = mapped[i]
    label_name = ds.loader.dataset.classes[label_idx]
    print(f"sample {i:2d}: {label_idx} → {label_name}")

AssertionError: 

In [None]:
img = Image.open("cue-conflict/val/airplane/XYZ.jpg").convert("RGB")
inp = proc(img).unsqueeze(0).to(device)
log = model.forward_batch(inp)
pred0 = int(log.argmax())
print("Imagenet top1:", pred0, imagenet_classes[pred0])
print("Mapped cue-conflict:", ds.decision_mapping(log)[0],
      ds.loader.dataset.classes[ds.decision_mapping(log)[0]])