# CLIP vs ResNeXt Classification

#### Reference

- https://github.com/openai/CLIP
- https://openai.com/blog/clip/

In [None]:
# Environment switch read by the install cell: True runs the Colab setup
# branch (GPU check, repo clone, asset copy); False only pip-installs packages.
COLAB = True

## Install the libraries

In [None]:
# Environment setup. The lines starting with `!` are shell commands run by
# IPython, so this cell only works inside a notebook kernel.
if COLAB:
  # --- Remember to change the runtime to use GPU for better performance
  # Print the GPU status so the reader can confirm a GPU runtime is attached.
  !nvidia-smi
  # Quietly install/upgrade the extra Python packages used by the notebook.
  !pip install -Uqq ftfy regex tqdm 
  # Drop any previous checkout, then clone the repo that holds the CLIP code
  # and the test images.
  !rm -rf jupyter-notebooks && git clone https://github.com/alpha2phi/jupyter-notebooks.git 
  # Replace the local CLIP/ and test_data/ directories with fresh copies from
  # the clone so later cells can use `from CLIP import clip` and the relative
  # `test_data/...` paths.
  !rm -rf CLIP && cp -R jupyter-notebooks/nbs/CLIP .
  !rm -rf test_data && cp -R jupyter-notebooks/nbs/test_data .
else:
  # Outside Colab, torch and torchvision are installed here as well.
  # NOTE(review): this branch does not fetch CLIP/ or test_data/ — presumably
  # they are expected to already sit next to the notebook; confirm.
  !pip install -Uqq ftfy regex tqdm torch torchvision

Mon Jan 11 15:50:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Import Libraries and Model

In [None]:
import os
import torch
from CLIP import clip
from PIL import Image

# Choose the compute device: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 CLIP checkpoint onto that device. The call hands back the
# model and the image transform that is applied to images before encoding.
model, preprocess = clip.load('ViT-B/32', device)

print(f"Device - {device}")

100%|███████████████████████| 353976522/353976522 [00:05<00:00, 60108148.06it/s]


Device - cuda


## Prediction

In [None]:
# Read the label list: one category per line, surrounding whitespace removed.
with open("CLIP/imagenet_classes.txt", "r") as labels_file:
    categories = [line.strip() for line in labels_file]

def predict_clip(image_file_path, top_k=5):
    """Print the labels from ``categories`` that CLIP scores highest for an image.

    The image is passed through ``preprocess`` and ``model.encode_image``;
    every entry of ``categories`` is tokenized and passed through
    ``model.encode_text``. Both feature sets are divided by their L2 norm, the
    image/text dot products are scaled by 100, and a softmax over the labels
    turns them into the percentages that are printed.

    Args:
        image_file_path: Path of the image file to classify.
        top_k: How many labels to print. Defaults to 5; values larger than
            the number of categories are clamped to that number.

    Returns:
        None. The ranked labels are written to stdout.
    """
    # Open the image in a context manager so the file handle is released as
    # soon as the pixels have been converted into a tensor.
    with Image.open(image_file_path) as pil_image:
        image = preprocess(pil_image).unsqueeze(0).to(device)

    # Note: the full label list is tokenized and encoded on every call.
    text = clip.tokenize(categories).to(device)

    # Inference only, so no autograd bookkeeping is needed for any step.
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        # Unit-normalise both sides so the matrix product is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # topk raises if asked for more entries than exist, so clamp the request.
    k = min(top_k, len(categories))
    values, indices = similarity[0].topk(k)

    print("\nTop predictions:\n")
    for value, index in zip(values, indices):
        print(f"{categories[index]:>16s}: {100 * value.item():.2f}%")

In [None]:
# Classify the bear photo from the copied test data and print its top labels.
predict_clip("test_data/images/bear.jpg")


Top predictions:

      brown bear: 92.33%
American black bear: 5.21%
        bearskin: 0.58%
        ice bear: 0.30%
      sloth bear: 0.08%


In [None]:
# Classify the bird image from the copied test data.
# NOTE(review): in the recorded output every top-5 score is below 1% and the
# labels are unrelated to birds. The cause is not visible here — possibly how
# this PNG is decoded or preprocessed; inspect the image before trusting it.
predict_clip("test_data/images/bird.png")


Top predictions:

  Scotch terrier: 0.68%
wire-haired fox terrier: 0.45%
          Loafer: 0.41%
   parallel bars: 0.40%
   Irish terrier: 0.38%
