# Image categorization with CLIP

Created by [Artem Konevskikh](https://aiculedssul.net)

In [None]:
#@title Install libraries
!pip install ftfy
!pip install git+https://github.com/openai/CLIP.git

In [None]:
#@title Import libraries
import glob
import json
import os
import clip
import torch
import tqdm
from PIL import Image

In [None]:
#@title Load the model
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
# clip.load returns the model together with the image preprocessing callable
# that is applied to every image before encoding.
model, preprocess = clip.load('ViT-B/32', device)

In [None]:
#@title Run
# User-editable parameters. The `#@markdown` / `#@param` annotations are
# Colab form directives, so keep them on these exact lines.
#@markdown Path to the directory containing images
img_dir = "/content/images" #@param {"type": "string"}
#@markdown Categories, should be comma separated
categories = "a dog, a cat, a mushroom" #@param {"type": "string"}

# find all image files in the directory
img_ext = ['tif', 'tiff', 'TIF', 'TIFF', 'png', 'PNG', 'jpg', 'jpeg', 'JPG', 'JPEG', 'bmp', 'BMP']
# Escape the directory so glob metacharacters in its name ("[", "*", "?") are
# matched literally. Matches are collected in a set because on a
# case-insensitive filesystem "*.jpg" and "*.JPG" return the same files, then
# sorted so the processing (and output) order is stable between runs.
found_paths = set()
for ext in img_ext:
  found_paths.update(glob.glob(os.path.join(glob.escape(img_dir), '*.' + ext)))
images = sorted(found_paths)

# split categories, dropping blank entries (e.g. from a trailing comma) so
# that an empty string is never scored as a category
cats = [c for c in (part.strip() for part in categories.split(",")) if c]
if not cats:
  raise ValueError("No categories given: provide at least one comma-separated label")
# tokenize each label and stack the tokens into one batch on the model's device
text_inputs = torch.cat([clip.tokenize(c) for c in cats]).to(device)
# one entry per processed image is appended here
result = []
# The text embeddings depend only on the categories, so encode and normalize
# them once here instead of repeating the same work for every image.
with torch.no_grad():
  text_features = model.encode_text(text_inputs)
  text_features /= text_features.norm(dim=-1, keepdim=True)

# process images
for image_path in tqdm.tqdm(images):
  # read image; the context manager closes the file handle as soon as the
  # preprocessed input has been built
  try:
    with Image.open(image_path) as image:
      image_input = preprocess(image).unsqueeze(0).to(device)
  except OSError as err:
    # one unreadable file should not abort the whole batch and lose the
    # results gathered so far; report it without breaking the progress bar
    tqdm.tqdm.write(f"Skipping {image_path}: {err}")
    continue
  with torch.no_grad():
    # get the image embedding and normalize it
    image_features = model.encode_image(image_input)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    # get category probabilities: scaled similarities passed through softmax
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
  # topk over all categories returns every label, ordered by descending probability
  values, indices = similarity[0].topk(len(cats))
  result.append({
    "file": image_path,
    "pred": [{"label": str(cats[int(index)]), "prob": float(value)} for value, index in zip(values, indices)]
  })

# Save all predictions as JSON. The file is opened explicitly as UTF-8 because
# ensure_ascii=False writes non-ASCII characters (labels, file names) verbatim,
# which can fail when the platform's default encoding cannot represent them.
with open('/content/results.json', 'w', encoding='utf-8') as out_file:
  json.dump(result, out_file, indent=4, ensure_ascii=False)
