## Introduction to PyTorch, HuggingFace, and Google Colab

## 1. Check installed packages.

In [None]:
# Show every package installed in this runtime's Python environment.
!pip list

In [None]:
# Check whether a specific package is installed by filtering the pip listing:
# `grep -i` keeps only the lines that contain "torch", ignoring case.
!pip list | grep -i torch

## 2. Load the CLIP model onto the GPU and report GPU memory usage.

In [None]:
# The CLIP model is loaded through the `transformers` package, so check that it is installed:
# `grep -i` keeps only the lines of the pip listing that contain "transformers", ignoring case.
!pip list | grep -i transformers

## Check GPU memory usage before loading the model (see the usage at xxxMiB / 15360MiB)

In [None]:
# Print the GPU status table; run this before loading the model to record baseline memory usage.
!nvidia-smi

## Load the model to GPU (Make sure to use GPU runtime)

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch
# Use the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint so the model and processor always match.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# device_map places the model weights on the selected device while loading.
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT, device_map=device)
# The processor only tokenizes text and preprocesses images, so it is loaded
# without a device_map; the tensors it produces are moved to the device at inference time.
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
print("Model loaded to ",device)

## Check GPU memory usage after loading the CLIP model (see the usage at xxxMiB / 15360MiB)

In [None]:
# Print the GPU status table again; compare the memory usage with the reading taken before loading the model.
!nvidia-smi

# Running the CLIP model on a sample image

In [None]:
# We require requests to fetch Image from the URL, PIL to load the image and matplotlib to show an image
import requests
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
# Let's define an image-inference function, because we will reuse this code for multiple inferences.

def infer_model(processor, model, image, captions, device="cpu", isImageURL=False, isImagePath=False):
  """Score how well each caption matches an image.

  Args:
    processor: Callable invoked as
      ``processor(text=captions, images=image, return_tensors="pt", padding=True)``.
      Its result must support ``.to(device)`` and ``**`` unpacking.
    model: Callable invoked as ``model(**inputs)``; its result must expose
      a ``logits_per_image`` tensor.
    image: An already-loaded image, a URL (when ``isImageURL`` is true) or a
      local file path (when ``isImagePath`` is true).
    captions: Captions to compare against the image.
    device: Device for the inputs and for autocast. Accepts a plain string
      ("cpu", "cuda"), an indexed string ("cuda:0") or a ``torch.device``.
    isImageURL: Treat ``image`` as a URL and download it. Takes precedence
      over ``isImagePath`` when both are set.
    isImagePath: Treat ``image`` as a local path and open it.

  Returns:
    Tuple ``(logits_per_image, probs)``: the raw image-text similarity scores
    and their softmax over dim 1.

  Raises:
    requests.HTTPError: If the image download returns an error status.
  """
  if isImageURL:
    # Bound the wait so a stalled server cannot hang the cell, and surface HTTP
    # errors directly instead of letting PIL fail on an error page.
    response = requests.get(image, stream=True, timeout=30)
    response.raise_for_status()
    image = Image.open(response.raw)
  elif isImagePath:
    image = Image.open(image)

  # Tokenize the captions and preprocess the image into one padded batch.
  inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)
  inputs = inputs.to(device)

  # torch.autocast expects a device *type* ("cuda"/"cpu"), not a full device
  # spec, so strip any index and accept torch.device objects as well.
  device_type = torch.device(device).type

  # Inference only: no gradients, mixed precision where the backend supports it.
  with torch.no_grad(), torch.autocast(device_type):
    outputs = model(**inputs)

  logits_per_image = outputs.logits_per_image  # image-text similarity scores
  probs = logits_per_image.softmax(dim=1)  # normalize across captions to get probabilities

  return logits_per_image, probs

In [None]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Download the image once; bound the wait and fail fast on HTTP errors.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

plt.imshow(image)

# List the captions to test against the image
captions = ["a photo of a cat", "a photo of a dog"]

# Reuse the image downloaded above instead of fetching the URL a second time.
# (infer_model can also download it itself when called with the URL and isImageURL=True.)
logits_per_image, probs = infer_model(processor, model, image, captions, device)

# Print the probability of each caption
for i, cap in enumerate(captions):
  print(f"{cap}: {probs[0][i].item():.4f}")


# Now let's upload a sample image from our local machine to Google Drive and run inference on it

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this notebook.
# Follow the "connect to Drive" popup instructions when prompted.
from google.colab import drive
drive.mount('/content/drive')

# Upload the personcakefridge.png image (attached with this notebook) to your Google Drive

In [None]:
import os  # used to build the image path

# File name of the image inside the Drive folder; change if you are using some other image.
IMAGE_FILE_NAME = "personcakefridge.png"
image_path = os.path.join("/content/drive/MyDrive", IMAGE_FILE_NAME)

# Open the image and display it.
image = Image.open(image_path)
plt.imshow(image)

In [None]:
# Five candidate captions to rank against the image
captions = [
    "a photo of a person holding a baby",
    "a photo of cake on the table",
    "a photo of a family celebrating birthday of a little girl",
    "a photo of a person standing in front of a fridge",
    "a photo of a person cutting the cake"
]

# `image` is passed as an already-loaded object, so both loader flags stay off
logits_per_image, probs = infer_model(
    processor, model, image, captions, device,
    isImageURL=False, isImagePath=False,
)

# Probabilities for the single image in the batch, one entry per caption
caption_probs = probs[0]

# Report the probability assigned to each caption
for i, cap in enumerate(captions):
  print(f"{cap}: [{caption_probs[i].item():.4f}]")

# Select the caption with the highest probability
best_caption_idx = caption_probs.argmax().item()
best_caption = captions[best_caption_idx]
best_prob = caption_probs[best_caption_idx]
print(f"\n\n\n Best caption: {best_caption} [{best_prob:.4f}]")