In [None]:
!pip install -q transformers pillow torch torchvision numpy
import torch
import numpy as np
from PIL import Image
from transformers import AutoProcessor, AutoModel


In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# This notebook only performs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)


torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [3]:
# Hugging Face Hub identifier of the SigLIP checkpoint used throughout the notebook.
MODEL_NAME = "google/siglip-base-patch16-224"

# The processor prepares model inputs; it is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Load the weights, move them to the selected device, then switch to evaluation mode.
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/711 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/813M [00:00<?, ?B/s]

SiglipModel(
  (text_model): SiglipTextTransformer(
    (embeddings): SiglipTextEmbeddings(
      (token_embedding): Embedding(32000, 768)
      (position_embedding): Embedding(64, 768)
    )
    (encoder): SiglipEncoder(
      (layers): ModuleList(
        (0-11): 12 x SiglipEncoderLayer(
          (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (self_attn): SiglipAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (mlp): SiglipMLP(
            (activation_fn): GELUTanh()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features

In [5]:
def load_image(path):
    """Open the image file at ``path`` and return it as an RGB ``PIL.Image``.

    Args:
        path: Location of the image file, passed straight to ``Image.open``.

    Returns:
        A new image in ``"RGB"`` mode.

    Raises:
        Whatever ``Image.open`` raises for a missing or unreadable file.
    """
    # Image.open is lazy and keeps the file open; convert() returns a new,
    # fully loaded image, so the source handle can be released on exit.
    with Image.open(path) as img:
        return img.convert("RGB")
# Load the sample image that the following cells embed. The absolute path
# must already exist in the runtime's filesystem before this cell is run.
image = load_image("/content/sample.jpg")


In [6]:
def compute_global_embedding(image):
    """Return the L2-normalised image embedding of ``image`` as a NumPy array.

    Uses the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        image: The image to embed, passed to ``processor(images=...)``.

    Returns:
        A CPU NumPy array holding the unit-norm image features. For a single
        image the leading batch axis is dropped, giving a 1-D vector.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Disable autograd locally so the function does not depend on an earlier
    # cell having called torch.set_grad_enabled(False); without that, the
    # .numpy() call below fails on a tensor that requires grad.
    with torch.no_grad():
        features = model.get_image_features(**inputs)
        # Scale each feature vector to unit length so dot products are cosine similarities.
        embedding = features / features.norm(dim=-1, keepdim=True)

    # Drop only the batch axis (a no-op when more than one image was embedded).
    return embedding.squeeze(0).cpu().numpy()
# Embed the whole sample image; E_global is reused by the later artifact,
# similarity and test cells.
E_global = compute_global_embedding(image)
print(E_global.shape)


(768,)


In [7]:
def split_into_regions(image, grid_size=2):
    """Split ``image`` into a ``grid_size`` x ``grid_size`` grid of crops.

    The cut points are spread proportionally over the full width and height,
    so the regions tile the whole image even when its size is not divisible
    by ``grid_size`` (no right/bottom pixels are dropped).

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and
            ``crop((left, upper, right, lower))``, e.g. a ``PIL.Image``.
        grid_size: Number of regions along each axis (default 2).

    Returns:
        A list of ``grid_size ** 2`` crops in column-major order: the outer
        loop walks columns left to right, the inner loop rows top to bottom.

    Raises:
        ValueError: If ``grid_size`` is smaller than 1 or larger than the
            shorter image side (which would produce empty regions).
    """
    w, h = image.size
    if grid_size < 1 or grid_size > min(w, h):
        raise ValueError(
            f"grid_size must be between 1 and {min(w, h)} for a {w}x{h} image, "
            f"got {grid_size}"
        )

    # Integer cut points along each axis; the final one is exactly the full
    # dimension, so the last row/column absorbs any remainder pixels.
    x_edges = [(i * w) // grid_size for i in range(grid_size + 1)]
    y_edges = [(j * h) // grid_size for j in range(grid_size + 1)]

    regions = []
    for i in range(grid_size):
        for j in range(grid_size):
            box = (x_edges[i], y_edges[j], x_edges[i + 1], y_edges[j + 1])
            regions.append(image.crop(box))

    return regions
def compute_local_embeddings(image, grid_size=2):
    """Embed every grid region of ``image`` and stack the results.

    Args:
        image: Image to split, forwarded to ``split_into_regions``.
        grid_size: Number of regions along each axis (default 2).

    Returns:
        A NumPy array with one row per region, in the order produced by
        ``split_into_regions``; each row comes from ``compute_global_embedding``.
    """
    tiles = split_into_regions(image, grid_size)
    return np.stack([compute_global_embedding(tile) for tile in tiles])
# Embed the four quadrants of the sample image; E_local is reused by the later
# artifact, similarity and test cells.
E_local = compute_local_embeddings(image, grid_size=2)
print(E_local.shape)


(4, 768)


In [8]:
# Identifier stored in the artifact and used to name the output file.
image_id = "img_0001"

# Collect the embeddings as plain Python lists so the record can be written as JSON.
artifact = {"image_id": image_id}
artifact["embedding_global"] = E_global.tolist()
artifact["embedding_local"] = E_local.tolist()


In [9]:
import json

# Persist the artifact next to the notebook, in a file named after the image id.
output_path = f"{image_id}_siglip_embeddings.json"
with open(output_path, "w") as out_file:
    json.dump(artifact, out_file)


In [10]:
# A unit-norm vector dotted with itself should give ~1.
sim = E_global.dot(E_global)
print("Self similarity:", sim)

# One cosine similarity per local region against the global embedding.
local_sims = np.matmul(E_local, E_global)
print("Local similarities:", local_sims)


Self similarity: 1.0
Local similarities: [0.79976124 0.91595787 0.68398166 0.869001  ]


TESTING


In [11]:
import numpy as np

# The global embedding is a single vector; the local ones form a matrix.
assert E_global.ndim == 1, "Global embedding must be 1D"
assert E_local.ndim == 2, "Local embeddings must be 2D"

# Every embedding should have been scaled to unit length.
global_norm = np.linalg.norm(E_global)
local_norms = np.linalg.norm(E_local, axis=1)

print("Global norm:", global_norm)
print("Local norms:", local_norms)

unit_tolerance = 1e-3
assert np.isclose(global_norm, 1.0, atol=unit_tolerance)
assert np.allclose(local_norms, 1.0, atol=unit_tolerance)


Global norm: 1.0
Local norms: [0.99999994 1.         0.99999994 1.        ]


In [12]:
# Embed the same image a second time; identical runs should give a zero distance.
E_global_2 = compute_global_embedding(image)

run_delta = E_global - E_global_2
diff = np.linalg.norm(run_delta)
print("Difference between runs:", diff)


Difference between runs: 0.0


In [13]:
# Two comparison images: B is meant to resemble the sample (A), C is not.
img_B = load_image("/content/similar.jpg")
img_C = load_image("/content/different.jpg")

# A reuses the global embedding of the sample image computed earlier.
E_A = E_global
E_B = compute_global_embedding(img_B)
E_C = compute_global_embedding(img_C)

# The embeddings are unit-norm, so each dot product is a cosine similarity.
sim_AB = E_A.dot(E_B)
sim_AC = E_A.dot(E_C)

print("Similarity A-B (similar):", sim_AB)
print("Similarity A-C (different):", sim_AC)


Similarity A-B (similar): 0.7941096
Similarity A-C (different): 0.5390408


In [14]:
# Cosine similarity of each local region against the global embedding.
local_sims = np.matmul(E_local, E_global)

print("Local → Global similarities:")
for region_index, region_sim in enumerate(local_sims):
    print(f"Region {region_index}: {region_sim:.4f}")

# Spread of the per-region similarities.
print("Std dev:", local_sims.std())


Local → Global similarities:
Region 0: 0.7998
Region 1: 0.9160
Region 2: 0.6840
Region 3: 0.8690
Std dev: 0.08730362


In [15]:
# Recompute the local embeddings on a 2x2 and a 3x3 grid; the row count
# should equal the number of grid cells.
E_local_2x2 = compute_local_embeddings(image, 2)
E_local_3x3 = compute_local_embeddings(image, 3)

print("2x2 shape:", E_local_2x2.shape)
print("3x3 shape:", E_local_3x3.shape)


2x2 shape: (4, 768)
3x3 shape: (9, 768)


In [16]:
import json

# Read the saved artifact back and compare its global embedding with the in-memory one.
saved_path = f"{image_id}_siglip_embeddings.json"
with open(saved_path) as in_file:
    loaded = json.load(in_file)

E_loaded = np.array(loaded["embedding_global"])
reload_delta = E_loaded - E_global
print("Reload diff:", np.linalg.norm(reload_delta))


Reload diff: 0.0
