In [24]:
import torch, numpy as np
from PIL import Image
from tqdm import tqdm
from torchvision.transforms import ToPILImage

# make sure Python sees your local modelvshuman package
import sys
sys.path.append(".")

# imports from your code
from modelvshuman.models.wrappers.pytorch import HyCoCLIPModel
from modelvshuman.utils import load_dataset
from hycoclip.lorentz import pairwise_dist
from modelvshuman.models.pytorch.clip.imagenet_classes import imagenet_classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [2]:
# Adjust args to match your factory signature
model = HyCoCLIPModel("hycoclip", "cue-conflict")
# if your wrapper doesn’t auto-load, do:
# ckpt = torch.load("path/to/hycoclip_vit_s.pth", map_location="cpu", weights_only=False)
# model.model.load_state_dict(ckpt.get("model", ckpt), strict=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.model.to(device)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
HyCoCLIP zeroshot: 100%|██████████| 1000/1000 [01:35<00:00, 10.51it/s]


HyCoCLIP(
  (visual): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (patch_drop): Identity()
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (q_norm): Identity()
          (k_norm): Identity()
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
  

In [3]:
ds = load_dataset("cue-conflict", split="val")
loader = ds.loader  # DataLoader instance

In [4]:
# [1] checkpoint parameters
params  = list(model.model.parameters())
tot     = sum(p.numel() for p in params)
nz      = sum((p.abs()>1e-6).sum().item() for p in params)
print("loaded params:", nz, "/", tot, f"({100*nz/tot:.1f}%)")

# [2] zeroshot weights
zs = model.zeroshot_weights
print("zeroshot_weights:", zs.shape,
      "min/max/mean:", zs.min(), zs.max(), zs.mean())
print("first norms:", zs.norm(dim=0)[:10])

loaded params: 84315215 / 85290372 (98.9%)
zeroshot_weights: torch.Size([512, 1000]) min/max/mean: tensor(-0.2459, device='cuda:0') tensor(0.2461, device='cuda:0') tensor(0.0002, device='cuda:0')
first norms: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000], device='cuda:0')


In [5]:
# grab one batch
imgs, targets, _ = next(iter(loader))
print("raw:", imgs.shape, imgs.min(), imgs.max())

proc = model.preprocess()
pimgs = [proc(ToPILImage()(im)) for im in imgs[:4]]
batch = torch.stack(pimgs)
print("pre:", batch.shape, batch.min(), batch.max())

raw: torch.Size([16, 3, 224, 224]) tensor(-2.1179) tensor(2.6400)
pre: torch.Size([4, 3, 224, 224]) tensor(-2.1179) tensor(2.6400)


In [6]:
logits = model.forward_batch(imgs[:4].to(device))
print("logits:", logits.shape,
      "min/max/mean:", logits.min(), logits.max(), logits.mean())

top5 = np.argsort(-logits, axis=1)[:4,:5]
for i in range(4):
    print(f"sample {i} top5:", top5[i])

logits: (4, 1000) min/max/mean: -113.24858 -75.91841 -96.67283
sample 0 top5: [489 632 545 490 999]
sample 1 top5: [920 781 566 727 645]
sample 2 top5: [919 489 475 920 727]
sample 3 top5: [489 490 581 919 545]


In [7]:
print(imagenet_classes)

['tench', 'goldfish', 'great white shark', 'tiger shark', 'hammerhead shark', 'electric ray', 'stingray', 'rooster', 'hen', 'ostrich', 'brambling', 'goldfinch', 'house finch', 'junco', 'indigo bunting', 'American robin', 'bulbul', 'jay', 'magpie', 'chickadee', 'American dipper', 'kite (bird of prey)', 'bald eagle', 'vulture', 'great grey owl', 'fire salamander', 'smooth newt', 'newt', 'spotted salamander', 'axolotl', 'American bullfrog', 'tree frog', 'tailed frog', 'loggerhead sea turtle', 'leatherback sea turtle', 'mud turtle', 'terrapin', 'box turtle', 'banded gecko', 'green iguana', 'Carolina anole', 'desert grassland whiptail lizard', 'agama', 'frilled-necked lizard', 'alligator lizard', 'Gila monster', 'European green lizard', 'chameleon', 'Komodo dragon', 'Nile crocodile', 'American alligator', 'triceratops', 'worm snake', 'ring-necked snake', 'eastern hog-nosed snake', 'smooth green snake', 'kingsnake', 'garter snake', 'water snake', 'vine snake', 'night snake', 'boa constrictor

In [8]:
probs = np.zeros((1, logits.shape[1]), dtype=np.float32)
idx   = imagenet_classes.index("hen")
probs[0, idx] = 1.0
print("mapped hen->", ds.decision_mapping(probs)[0])

mapped hen-> ['bird' 'truck' 'knife' 'oven' 'elephant' 'dog' 'clock' 'keyboard' 'chair'
 'cat' 'bottle' 'car' 'boat' 'bicycle' 'bear' 'airplane']


In [9]:
print("=== ImageNet top-5 per sample ===")
for i in range(min(4, logits.shape[0])):
    top5_idx = np.argsort(-logits[i])[:5]
    top5_names = [imagenet_classes[j] for j in top5_idx]
    print(f"sample {i:2d}:", top5_names)

=== ImageNet top-5 per sample ===
sample  0: ['chain-link fence', 'music speaker', 'electric fan', 'chain mail', 'toilet paper']
sample  1: ['traffic light', 'scoreboard', 'French horn', 'planetarium', 'maypole']
sample  2: ['traffic or street sign', 'chain-link fence', 'car mirror', 'traffic light', 'planetarium']
sample  3: ['chain-link fence', 'chain mail', 'radiator grille', 'traffic or street sign', 'electric fan']


In [20]:
import torch

# 1) turn to a tensor
logits_t = torch.from_numpy(logits)        # [B×1000], floats

# 2) softmax over the class dimension
probs_t = torch.softmax(logits_t, dim=1)   # each row sums to 1

# 3) back to NumPy
probs = probs_t.numpy()                    # still [B×1000]

# 4) now map
mapped = ds.decision_mapping(probs)
print("mapped:", type(mapped), "dtype:", mapped.dtype, "shape:", mapped.shape)
print("first row:", mapped[0])
print("second row:", mapped[1])

print("\n=== Mapped cue-conflict predictions ===")
top1_names = mapped[:, 0]    # a length-B array of the most likely class names

for i, name in enumerate(top1_names[:4]):
    print(f"sample {i:2d}: {name}")

mapped: <class 'numpy.ndarray'> dtype: <U8 shape: (4, 16)
first row: ['keyboard' 'clock' 'car' 'bird' 'bottle' 'dog' 'truck' 'chair' 'bear'
 'airplane' 'knife' 'bicycle' 'boat' 'cat' 'oven' 'elephant']
second row: ['bird' 'truck' 'clock' 'car' 'elephant' 'bear' 'dog' 'boat' 'chair'
 'keyboard' 'bottle' 'cat' 'oven' 'bicycle' 'knife' 'airplane']

=== Mapped cue-conflict predictions ===
sample  0: keyboard
sample  1: bird
sample  2: clock
sample  3: keyboard


In [26]:
hyco = model.model
hyco.eval()
with torch.no_grad():
    feats1 = hyco.encode_image(batch[:5].to(device), project=True)
    feats2 = hyco.encode_image(batch[:5].clone().to(device), project=True)
    print("identical? ", torch.allclose(feats1, feats2))
    # check variability
    d = (feats1[0] - feats1[1]).norm().item()
    print("feature distance between sample 0 & 1:", d)


identical?  True
feature distance between sample 0 & 1: 0.1923612803220749


In [42]:
import torch, gc
from modelvshuman.utils import load_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gc.collect()
torch.cuda.empty_cache()


# 1) load the wrapped CLIP model
model_wrapper, _ = load_model("clip")

# 2) get the cue-conflict dataloader
ds = load_dataset("cue-conflict", batch_size=16, num_workers=16)
loader = ds.loader

model_wrapper.model.to(device)
model_wrapper.model.eval()



100%|██████████| 1000/1000 [03:05<00:00,  5.40it/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [47]:
!python -m modelvshuman \
    -m hycoclip \
    -d cue-conflict \
    -b 64 \
    -w 8


Evaluating model hycoclip on dataset cue-conflict using Pytorch Evaluator








accuracy (top-1): 6.02




  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]

HyCoCLIP zeroshot:   0%|          | 0/1000 [00:00<?, ?it/s]
HyCoCLIP zeroshot:   0%|          | 1/1000 [00:00<04:36,  3.62it/s]
HyCoCLIP zeroshot:   0%|          | 3/1000 [00:00<02:26,  6.81it/s]
HyCoCLIP zeroshot:   0%|          | 4/1000 [00:00<02:11,  7.56it/s]
HyCoCLIP zeroshot:   0%|          | 5/1000 [00:00<02:01,  8.19it/s]
HyCoCLIP zeroshot:   1%|          | 6/1000 [00:00<01:56,  8.55it/s]
HyCoCLIP zeroshot:   1%|          | 7/1000 [00:00<01:52,  8.86it/s]
HyCoCLIP zeroshot:   1%|          | 8/1000 [00:01<01:49,  9.04it/s]
HyCoCLIP zeroshot:   1%|          | 9/1000 [00:01<01:47,  9.23it/s]
HyCoCLIP zeroshot:   1%|          | 10/1000 [00:01<01:45,  9.37it/s]
HyCoCLIP zeroshot:   1%|          | 11/1000 [00:01<01:45,  9.41it/s]
HyCoCLIP zeroshot:   1%|          | 12/1000 [00:01<01:44,  9.42it/s]
HyCoCLIP zeroshot:   1%|▏         | 13/1000 [00:01<01:44,  9.43it/s]
HyCoCLIP zeroshot:   1%|▏         | 14/1000 [0

In [46]:
!python -m modelvshuman \
    -m clip \
    -d cue-conflict \
    -b 64 \
    -w 8


Evaluating model clip on dataset cue-conflict using Pytorch Evaluator








accuracy (top-1): 44.06





  0%|          | 0/1000 [00:00<?, ?it/s]
  0%|          | 1/1000 [00:00<07:07,  2.34it/s]
  0%|          | 4/1000 [00:00<01:55,  8.60it/s]
  1%|          | 7/1000 [00:00<01:17, 12.89it/s]
  1%|          | 10/1000 [00:00<01:02, 15.91it/s]
  1%|▏         | 13/1000 [00:00<00:54, 18.01it/s]
  2%|▏         | 16/1000 [00:01<00:50, 19.56it/s]
  2%|▏         | 19/1000 [00:01<00:46, 21.03it/s]
  2%|▏         | 22/1000 [00:01<00:44, 21.75it/s]
  2%|▎         | 25/1000 [00:01<00:44, 22.02it/s]
  3%|▎         | 28/1000 [00:01<00:43, 22.30it/s]
  3%|▎         | 31/1000 [00:01<00:43, 22.35it/s]
  3%|▎         | 34/1000 [00:01<00:42, 22.65it/s]
  4%|▎         | 37/1000 [00:01<00:42, 22.68it/s]
  4%|▍         | 40/1000 [00:02<00:42, 22.69it/s]
  4%|▍         | 43/1000 [00:02<00:41, 23.03it/s]
  5%|▍         | 46/1000 [00:02<00:41, 22.73it/s]
  5%|▍         | 49/1000 [00:02<00:41, 22.73it/s]
  5%|▌         | 52/1000 [00:02<00:41, 22.93it/s]
  6%|▌         | 55/1000 [00:02<00:41, 22.80it/s]
  6%|▌   

In [None]:
img = Image.open("C:\Users\xjzb2\compo_learning\model-vs-human\datasets\cue-conflict\airplane\airplane1-bicycle2.png").convert("RGB")
inp = proc(img).unsqueeze(0).to(device)
log = model.forward_batch(inp)
pred0 = int(log.argmax())
print("Imagenet top1:", pred0, imagenet_classes[pred0])
print("Mapped cue-conflict:", ds.decision_mapping(log)[0],
      ds.loader.dataset.classes[ds.decision_mapping(log)[0]])

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (317916586.py, line 1)