In [1]:
import torch, numpy as np
from PIL import Image
from tqdm import tqdm
from torchvision.transforms import ToPILImage
import sys
sys.path.append(".")

from modelvshuman.models.wrappers.pytorch import HyCoCLIPModel
from modelvshuman.utils import load_dataset
from hycoclip.lorentz import pairwise_dist
from modelvshuman.models.pytorch.clip.imagenet_classes import imagenet_classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm







In [2]:
model = HyCoCLIPModel("hycoclip", "cue-conflict")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.model.to(device)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


HyCoCLIP(
  (visual): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (patch_drop): Identity()
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (q_norm): Identity()
          (k_norm): Identity()
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): Identity()
        (drop_path1): Identity()
        (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
  

In [3]:
ds = load_dataset("cue-conflict", split="val")
loader = ds.loader

In [4]:
params  = list(model.model.parameters())
tot     = sum(p.numel() for p in params)
nz      = sum((p.abs()>1e-6).sum().item() for p in params)
print("loaded params:", nz, "/", tot, f"({100*nz/tot:.1f}%)")

zs = model.zeroshot_weights
print("zeroshot_weights:", zs.shape,
      "min/max/mean:", zs.min(), zs.max(), zs.mean())
print("first norms:", zs.norm(dim=0)[:10])

loaded params: 84315215 / 85290372 (98.9%)
zeroshot_weights: torch.Size([512, 1000]) min/max/mean: tensor(-0.2459, device='cuda:0') tensor(0.2461, device='cuda:0') tensor(0.0002, device='cuda:0')
first norms: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000], device='cuda:0')


In [5]:
imgs, targets, _ = next(iter(loader))
print("raw:", imgs.shape, imgs.min(), imgs.max())

proc = model.preprocess()
pimgs = [proc(ToPILImage()(im)) for im in imgs[:4]]
batch = torch.stack(pimgs)
print("pre:", batch.shape, batch.min(), batch.max())

raw: torch.Size([16, 3, 224, 224]) tensor(-2.1179) tensor(2.6400)
pre: torch.Size([4, 3, 224, 224]) tensor(-2.1179) tensor(2.6400)


In [6]:
logits = model.forward_batch(imgs[:4].to(device))
print("logits:", logits.shape,
      "min/max/mean:", logits.min(), logits.max(), logits.mean())

top5 = np.argsort(-logits, axis=1)[:4,:5]
for i in range(4):
    print(f"sample {i} top5:", top5[i])

logits: (4, 1000) min/max/mean: 41.89975 73.410225 57.78988
sample 0 top5: [632 714 715 545 452]
sample 1 top5: [632 641 781 577 920]
sample 2 top5: [919 632 818 714 971]
sample 3 top5: [919 489 616 490 824]


In [9]:
print(imagenet_classes)
print(len(imagenet_classes))

['tench', 'goldfish', 'great white shark', 'tiger shark', 'hammerhead shark', 'electric ray', 'stingray', 'rooster', 'hen', 'ostrich', 'brambling', 'goldfinch', 'house finch', 'junco', 'indigo bunting', 'American robin', 'bulbul', 'jay', 'magpie', 'chickadee', 'American dipper', 'kite (bird of prey)', 'bald eagle', 'vulture', 'great grey owl', 'fire salamander', 'smooth newt', 'newt', 'spotted salamander', 'axolotl', 'American bullfrog', 'tree frog', 'tailed frog', 'loggerhead sea turtle', 'leatherback sea turtle', 'mud turtle', 'terrapin', 'box turtle', 'banded gecko', 'green iguana', 'Carolina anole', 'desert grassland whiptail lizard', 'agama', 'frilled-necked lizard', 'alligator lizard', 'Gila monster', 'European green lizard', 'chameleon', 'Komodo dragon', 'Nile crocodile', 'American alligator', 'triceratops', 'worm snake', 'ring-necked snake', 'eastern hog-nosed snake', 'smooth green snake', 'kingsnake', 'garter snake', 'water snake', 'vine snake', 'night snake', 'boa constrictor

In [10]:
probs = np.zeros((1, logits.shape[1]), dtype=np.float32)
idx   = imagenet_classes.index("hen")
probs[0, idx] = 1.0
print("mapped hen->", ds.decision_mapping(probs)[0])

mapped hen-> ['bird' 'truck' 'knife' 'oven' 'elephant' 'dog' 'clock' 'keyboard' 'chair'
 'cat' 'bottle' 'car' 'boat' 'bicycle' 'bear' 'airplane']


In [11]:
print("=== ImageNet top-5 per sample ===")
for i in range(min(4, logits.shape[0])):
    top5_idx = np.argsort(-logits[i])[:5]
    top5_names = [imagenet_classes[j] for j in top5_idx]
    print(f"sample {i:2d}:", top5_names)

=== ImageNet top-5 per sample ===
sample  0: ['music speaker', 'plectrum', 'Pickelhaube', 'electric fan', 'poke bonnet']
sample  1: ['music speaker', 'maraca', 'scoreboard', 'gong', 'traffic light']
sample  2: ['traffic or street sign', 'music speaker', 'spotlight', 'plectrum', 'bubble']
sample  3: ['traffic or street sign', 'chain-link fence', 'knot', 'chain mail', 'scarf']


In [12]:
import torch

logits_t = torch.from_numpy(logits)        #[B×1000]
probs_t = torch.softmax(logits_t, dim=1)   
probs = probs_t.numpy()                    #[B×1000]
mapped = ds.decision_mapping(probs)
print("mapped:", type(mapped), "dtype:", mapped.dtype, "shape:", mapped.shape)
print("first row:", mapped[0])
print("second row:", mapped[1])

print("\n=== Mapped cue-conflict predictions ===")
top1_names = mapped[:, 0]    #length-B array

for i, name in enumerate(top1_names[:4]):
    print(f"sample {i:2d}: {name}")

mapped: <class 'numpy.ndarray'> dtype: <U8 shape: (4, 16)
first row: ['bottle' 'clock' 'keyboard' 'car' 'bird' 'dog' 'airplane' 'truck' 'bear'
 'oven' 'chair' 'knife' 'elephant' 'bicycle' 'boat' 'cat']
second row: ['clock' 'bird' 'truck' 'car' 'dog' 'chair' 'elephant' 'bottle' 'boat'
 'oven' 'bear' 'cat' 'airplane' 'keyboard' 'knife' 'bicycle']

=== Mapped cue-conflict predictions ===
sample  0: bottle
sample  1: clock
sample  2: clock
sample  3: car


In [26]:
hyco = model.model
hyco.eval()
with torch.no_grad():
    feats1 = hyco.encode_image(batch[:5].to(device), project=True)
    feats2 = hyco.encode_image(batch[:5].clone().to(device), project=True)
    print("identical? ", torch.allclose(feats1, feats2))
    # check variability
    d = (feats1[0] - feats1[1]).norm().item()
    print("feature distance between sample 0 & 1:", d)


identical?  True
feature distance between sample 0 & 1: 0.1923612803220749


In [13]:
import torch, gc
from modelvshuman.utils import load_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gc.collect()
torch.cuda.empty_cache()

model_wrapper, _ = load_model("clip")
ds = load_dataset("cue-conflict", batch_size=16, num_workers=16)
loader = ds.loader

model_wrapper.model.to(device)
model_wrapper.model.eval()



100%|██████████| 1000/1000 [00:43<00:00, 23.19it/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [14]:
!python -m modelvshuman \
    -m hycoclip \
    -d cue-conflict \
    -b 64 \
    -w 8


Evaluating model hycoclip on dataset cue-conflict using Pytorch Evaluator








accuracy (top-1): 6.25




  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


















  5%|▌         | 1/20 [02:04<39:29, 124.72s/it]
 10%|█         | 2/20 [02:05<15:29, 51.62s/it] 
 15%|█▌        | 3/20 [02:05<07:59, 28.23s/it]
 20%|██        | 4/20 [02:05<04:35, 17.24s/it]
 25%|██▌       | 5/20 [02:06<02:47, 11.14s/it]
 30%|███       | 6/20 [02:06<01:44,  7.45s/it]
 35%|███▌      | 7/20 [02:06<01:06,  5.10s/it]
 40%|████      | 8/20 [02:07<00:42,  3.56s/it]
 45%|████▌     | 9/20 [02:07<00:27,  2.54s/it]
 50%|█████     | 10/20 [02:07<00:18,  1.84s/it]
 55%|█████▌    | 11/20 [02:07<00:12,  1.36s/it]
 60%|██████    | 12/20 [02:08<00:08,  1.04s/it]
 65%|██████▌   | 13/20 [02:08<00:05,  1.24it/s]
 70%|███████   | 14/20 [02:08<00:03,  1.55it/s]
 75%|███████▌  | 15/20 [02:09<00:02,  1.88it/s]
 80%|████████  | 16/20 [02:09<00:01,  2.22it/s]
 85%|████████▌ | 17/20 [02:09<00:01,  2.42it/s]
 90%|█████████ | 18/20 [02:09<00:00,  2.69it/s]
 95%|█████████▌| 19/20 [02:10<00:00,  2.93it/s]
100%

In [15]:
!python -m modelvshuman \
    -m clip \
    -d cue-conflict \
    -b 64 \
    -w 8


Evaluating model clip on dataset cue-conflict using Pytorch Evaluator








accuracy (top-1): 44.06





  0%|          | 0/1000 [00:00<?, ?it/s]
  0%|          | 1/1000 [00:00<03:26,  4.83it/s]
  0%|          | 4/1000 [00:00<01:12, 13.77it/s]
  1%|          | 7/1000 [00:00<00:56, 17.54it/s]
  1%|          | 10/1000 [00:00<00:48, 20.27it/s]
  1%|▏         | 13/1000 [00:00<00:44, 22.06it/s]
  2%|▏         | 16/1000 [00:00<00:43, 22.40it/s]
  2%|▏         | 19/1000 [00:00<00:41, 23.45it/s]
  2%|▏         | 22/1000 [00:01<00:40, 24.17it/s]
  2%|▎         | 25/1000 [00:01<00:39, 24.88it/s]
  3%|▎         | 28/1000 [00:01<00:39, 24.50it/s]
  3%|▎         | 31/1000 [00:01<00:40, 24.10it/s]
  3%|▎         | 34/1000 [00:01<00:40, 24.11it/s]
  4%|▎         | 37/1000 [00:01<00:42, 22.73it/s]
  4%|▍         | 40/1000 [00:01<00:42, 22.43it/s]
  4%|▍         | 43/1000 [00:01<00:40, 23.47it/s]
  5%|▍         | 46/1000 [00:02<00:39, 24.20it/s]
  5%|▍         | 49/1000 [00:02<00:38, 24.74it/s]
  5%|▌         | 52/1000 [00:02<00:38, 24.94it/s]
  6%|▌         | 55/1000 [00:02<00:37, 25.10it/s]
  6%|▌   