# CLIP vs ResNext Classification

#### Reference

- https://github.com/openai/CLIP
- https://openai.com/blog/clip/

In [23]:
# Selects which install commands the setup cell runs: True takes the
# conda + pip branch, False installs every dependency with pip.
COLAB = True

## Install the libraries

In [4]:
if COLAB:
    !conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
    !pip install -Uqq ftfy regex tqdm
else:
    !pip install -Uqq ftfy regex tqdm torch torchvision

## Import Libraries and Model

In [13]:
import os
import torch
from CLIP import clip
from PIL import Image
from torchvision.datasets import CIFAR100

# Choose the compute device: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 CLIP model onto that device, along with the callable used
# later to turn a PIL image into a model input.
# NOTE(review): `from CLIP import clip` presumably requires the openai/CLIP
# repository to be available as ./CLIP — confirm the setup step.
model, preprocess = clip.load('ViT-B/32', device=device)

##

## Prediction

In [17]:
def predict_clip(image_file_path, categories_path="imagenet_classes.txt", top_k=5):
    """Print the labels CLIP considers most similar to an image.

    Every non-blank line of ``categories_path`` is used as a candidate label.
    The image and all labels are encoded with the notebook-level ``model``,
    the embeddings are L2-normalised, and the scaled (x100) dot products are
    passed through a softmax over the labels. The best-scoring labels are
    printed with their probability as a percentage.

    Args:
        image_file_path: Path of the image to classify.
        categories_path: Text file with one label per line. Defaults to
            "imagenet_classes.txt".
        top_k: Maximum number of labels to print. Defaults to 5 and is capped
            at the number of labels available.

    Returns:
        None. The predictions are printed to stdout.

    Raises:
        ValueError: If ``categories_path`` contains no labels.
    """
    # Skip blank lines so they are not tokenized as empty-string prompts.
    with open(categories_path, "r") as f:
        categories = [line.strip() for line in f if line.strip()]
    if not categories:
        raise ValueError(f"no category labels found in {categories_path!r}")

    # The context manager closes the image file once preprocessing has read it.
    with Image.open(image_file_path) as raw_image:
        image = preprocess(raw_image).unsqueeze(0).to(device)
    text = clip.tokenize(categories).to(device)

    # Inference only: no gradients are needed for any of the tensor math below.
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        # Unit-normalise so the matrix product yields cosine similarities.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # topk raises if asked for more entries than exist, so cap k.
        values, indices = similarity[0].topk(min(top_k, len(categories)))

    print("\nTop predictions:\n")
    for value, index in zip(values, indices):
        print(f"{categories[index]:>16s}: {100 * value.item():.2f}%")

In [19]:
# Classify the sample bear image with the CLIP helper defined in this notebook.
predict_clip("test_data/images/bear.jpg")


Top predictions:

      brown bear: 92.40%
American black bear: 5.19%
        bearskin: 0.57%
        ice bear: 0.30%
          marmot: 0.08%


In [20]:
# Classify the sample bird image with the CLIP helper defined in this notebook.
predict_clip("test_data/images/bird.png")


Top predictions:

           robin: 25.30%
       brambling: 18.37%
          bulbul: 18.20%
          coucal: 7.87%
          orange: 7.58%
