<a href="https://colab.research.google.com/github/ashegde/notebooks/blob/main/exploring_open_clip_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This notebook contains some very simple commands to gain familiarity with
the open_clip project.
"""

In [None]:
!pip install open_clip_torch --quiet
!pip install datasets --quiet
!wget https://upload.wikimedia.org/wikipedia/commons/0/05/Cat.png --quiet

In [None]:
import torch
from PIL import Image
import open_clip

# Build the CLIP model together with its image transform, using the pretrained weights
# tagged 'laion2b_s34b_b79k'. The factory returns three values; the middle one is
# discarded and the last is kept as `preprocess`.
CLIP_ARCH = 'ViT-B-32'
model, _, preprocess = open_clip.create_model_and_transforms(
    CLIP_ARCH,
    pretrained='laion2b_s34b_b79k',
)
# The tokenizer is looked up by the same architecture name as the model.
tokenizer = open_clip.get_tokenizer(CLIP_ARCH)

In [None]:
# Zero-shot classification of one image against three candidate text labels.
image = preprocess(Image.open("Cat.png")).unsqueeze(0)  # add a leading batch dimension
text = tokenizer(["a diagram", "a dog", "a cat"])
model.eval()

# Inference only, so gradients are disabled. `torch.autocast` replaces the deprecated
# `torch.cuda.amp.autocast()`; it is enabled only when CUDA exists, which avoids the
# "CUDA is not available" warning on CPU-only hosts.
with torch.no_grad(), torch.autocast("cuda", enabled=torch.cuda.is_available()):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # L2-normalise so the dot product below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Scale the similarities by 100 and softmax over the labels.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    # (1, 3) = (1, 512) @ (512, 3)

# Columns follow the label order above. For a cat photo the mass is expected on the
# last label ("a cat"), i.e. roughly [[0., 0., 1.]] -- TODO confirm against a real run.
print("Label probs:", text_probs)

In [None]:
from datasets import load_dataset

# Load the "train" split of a small image/caption sample from the Hugging Face Hub
# (the dataset name suggests it holds 10 rows).
fw = load_dataset(
    'nielsr/datacomp-small-10-rows-with-image-feature',
    split="train",
)


In [None]:
images = torch.stack(
    list(map(preprocess, fw[0:-1]['image']))
)

In [None]:
captions = fw[0:-1]['text']
caption_tokens = tokenizer(captions)

In [None]:
# Embed all captions and all images, then L2-normalise each embedding so that a dot
# product between them is a cosine similarity.
# This is pure inference: no_grad avoids building an autograd graph (and keeping the
# activations alive), matching the single-image cell above.
with torch.no_grad():
    text_features = model.encode_text(caption_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    image_features = model.encode_image(images)
    image_features /= image_features.norm(dim=-1, keepdim=True)

In [None]:
# Pairwise image/caption similarities, scaled by 100, then a softmax over captions.
# Row i is the distribution over all captions for image i.
# (n_images, n_captions) = (n_images, dim) @ (dim, n_captions)
similarity_logits = (100.0 * image_features) @ text_features.T
text_probs = similarity_logits.softmax(dim=-1)

In [None]:
# Dump the full image-by-caption probability matrix (row = image, column = caption).
print("Caption probs:", text_probs)

In [None]:
# For every image, report the index of the caption that received the highest probability.
max_prob_captions = text_probs.argmax(dim=-1)
report_lines = []
for i in range(text_probs.size(0)):
    report_lines.append(f'\t image {i} -- caption {max_prob_captions[i]}\n')
print(*report_lines)

In [None]:
# Image-caption pair 5 is evidently mispredicted; show its full distribution over captions.
text_probs[5]

In [None]:
# Display the ground-truth caption of pair 5.
captions[5]
# Perhaps the misprediction is not surprising: the model likely has no context for mapping names to faces.

In [None]:
import matplotlib.pyplot as plt
# Show image 5 twice, side by side: the tensor fed to the model and the raw dataset image.
image = images[5]
fig, axs = plt.subplots(nrows=1, ncols=2)
panels = [
    # permute moves the first (channel) axis last, the layout imshow expects
    (image.permute(1, 2, 0), 'Preprocessed'),
    (fw[5]['image'], 'Original'),
]
for ax, (picture, title) in zip(axs, panels):
    ax.imshow(picture)
    ax.set_title(title)
plt.show()

In [None]:
# Even so, the model ties the text "Arthur Bothe" to an image that looks like a person,
# just not the correct person.

image = images[6]
fig, ax = plt.subplots()
# permute moves the first (channel) axis last, the layout imshow expects
ax.imshow(image.permute(1, 2, 0))
plt.show()