In [1]:
# Notebook-wide imports, kept together in the first cell.
import torch
from PIL import Image
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel

# Must start at column 0: an indented top-level import raises IndentationError.
# NOTE(review): pynvml and transforms are not referenced in the cells visible
# here — confirm they are still needed.
import pynvml  # type: ignore[import]


In [2]:
# Environment check: report whether the optional `accelerate` package can be imported.
try:
    import accelerate
except ImportError:
    print("accelerate is NOT installed in this environment")
else:
    # Only reached when the import succeeded.
    print(f"accelerate is installed: version {accelerate.__version__}")

accelerate is installed: version 1.10.1


In [3]:
# get the device
# Backend priority: CUDA first, then MPS, with CPU as the fallback.
# The chained conditional short-circuits, so MPS is only probed when CUDA is unavailable.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [4]:
# IPython shell escape: list the contents of the notebook's current working directory.
# NOTE(review): presumably a sanity check that the images/ folder used below exists — confirm.
!ls

blog_pics
CLIP_Embedding_samples_cats_pineapples.ipynb
images
README.md


In [6]:
# 1. Load pretrained CLIP model and processor
# One checkpoint id feeds both calls, so the processor can never be loaded
# from a different checkpoint than the model weights.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(device)  # weights moved to the selected device
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [7]:
# 2. Load example images
# Open the two sample files in order and bind each to its own name.
image_paths = ("images/cat.png", "images/pineapple.png")
cat_image, pineapple_image = map(Image.open, image_paths)

In [10]:
# 3. Define the text prompts (one per example image, in the same order)
texts = [
    "a photo of a cat",
    "a photo of a pineapple",
]

# 4. Preprocess the data
# Prompts and images go through the processor in one call.
example_images = [cat_image, pineapple_image]
encoded = processor(text=texts, images=example_images, padding=True, return_tensors="pt")

# Move the processed batch onto the device selected earlier.
inputs = encoded.to(device)

In [11]:
# 5. Forward pass
# Gradient tracking is switched off while the model runs.
with torch.no_grad():
    outputs = model(**inputs)

# Read the image and text embeddings off the model output; each has shape [2, 512].
image_embeds, text_embeds = outputs.image_embeds, outputs.text_embeds

In [12]:
# 6. Normalize embeddings (L2 normalization)
# F.normalize divides each row by max(||row||_2, eps). For non-zero rows this
# matches `x / x.norm(dim=-1, keepdim=True)`; an all-zero row becomes zeros
# instead of NaN.
image_embeds = torch.nn.functional.normalize(image_embeds, p=2, dim=-1)
text_embeds = torch.nn.functional.normalize(text_embeds, p=2, dim=-1)

In [13]:
# 7. Compute cosine similarities
# Rows index images and columns index text prompts: entry [i, j] is the dot
# product of image i with prompt j (a 2x2 matrix here).
cosine_sim = torch.matmul(image_embeds, text_embeds.transpose(0, 1))
print("Cosine similarity matrix:", cosine_sim, sep="\n")

Cosine similarity matrix:
tensor([[0.2832, 0.2036],
        [0.1956, 0.3286]], device='mps:0')
