Install Libraries

In [3]:
%%capture

!pip install openai-clip
!pip install datasets
!pip install torch
!pip install tqdm

- Model: `openai-clip` provides the base CLIP model
- Dataset: loaded from Hugging Face via `datasets`
- Torch: modeling code


**Zero-shot classification performance of CLIP**

In [4]:
from dataclasses import dataclass
from typing import List, Optional
import numpy as np

import clip
import torch


@dataclass
class ModelConfig:
    """Settings that select which CLIP checkpoint to load and how to load it."""
    # Checkpoint identifier handed to `clip.load` (this notebook uses "ViT-B/32").
    model_name: str
    # Forwarded as the `jit` argument of `clip.load`; disabled by default.
    enable_jit: bool = False

@dataclass
class InferenceConfig:
    """Settings for one zero-shot inference run."""
    # Which CLIP model to run inference with.
    model_config: ModelConfig
    # Candidate class names; the predicted labels are drawn from this list.
    labels: List[str]
    # Number of highest-scoring labels returned per image.
    top_k: int = 1
    # How many dataset examples to run inference on (this notebook sets 200);
    # defaults to None.
    num_of_inf_samples: Optional[int] = None

@dataclass
class EvalConfig:
  """Settings for evaluating an inference run.

  NOTE(review): not instantiated anywhere in the visible cells.
  """
  # Configuration of the inference run being evaluated.
  inference_config: InferenceConfig
  # Presumably the name of the metric to report -- confirm against its users.
  metric_name: str


@dataclass
class DataConfig:
  """Settings that identify the dataset to load."""
  # Dataset identifier passed to `datasets.load_dataset`
  # (this notebook uses 'ceyda/fashion-products-small').
  dataset_name: str

In [17]:
from datasets import load_dataset
from collections import Counter


class Dataset:
  """Thin wrapper around one split of a dataset loaded with `load_dataset`.

  The split is loaded once in `__init__` and reused by every accessor, so
  calling `get_dataset` / `get_labels` no longer triggers another
  `load_dataset` call.
  """

  def __init__(self, dataset_name: str, split: str = 'train'):
    """Load `split` of `dataset_name`.

    Args:
      dataset_name: Identifier passed to `load_dataset`.
      split: Key of the split to keep. Defaults to 'train'.
    """
    self.dataset_name = dataset_name
    self.split = split
    self.dataset = load_dataset(self.dataset_name)[self.split]

  def get_dataset(self):
    """Return the split that was loaded in `__init__` (no reload)."""
    return self.dataset

  def get_labels(self):
    """Return the distinct 'subCategory' values in a stable, sorted order.

    A bare `set` iterates strings in an order that changes between Python
    processes (hash randomization), which would make the label <-> index
    mapping differ from run to run. Sorting removes that source of
    irreproducibility. The result is also stored on `self.labels`.
    """
    # key=str keeps the sort from failing if a non-string value (e.g. None)
    # ever shows up in the column.
    self.labels = sorted(set(self.dataset['subCategory']), key=str)
    return self.labels

  def get_dataset_stats(self):
    """Print the number of unique ids and the per-category example counts."""
    num_of_samples = len(set(self.dataset['id']))
    print(f"num of samples: {num_of_samples}")
    print(f"masterCatergory {Counter(self.dataset['masterCategory'])}")
    print(f"subCatergory {Counter(self.dataset['subCategory'])}")

  def display(self, idx):
    """Render the image of example `idx`, then print the full record."""
    # Fetch the record once instead of indexing the dataset twice.
    example = self.dataset[idx]

    # `display` here is the global IPython helper available in notebooks,
    # not this method.
    display(example['image'])

    print(example)



In [6]:
class InferenceModel:
    """Zero-shot image classifier built on a pretrained CLIP model.

    Every label is turned into the prompt "a photo of <label>" and embedded
    with CLIP's text encoder. An image is classified by embedding it with the
    image encoder and returning the `top_k` labels whose text embeddings are
    most similar to it.
    """

    def __init__(self, model_name: str, enable_jit: bool, labels: List[str], top_k: int):
        """Load the CLIP model and remember the label set.

        Args:
            model_name: Checkpoint name passed to `clip.load`.
            enable_jit: Passed to `clip.load` as `jit`.
            labels: Candidate class names.
            top_k: Number of labels returned by `predict`.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Pass the device explicitly so the model is guaranteed to live where
        # `preprocess_data` / `precompute_text_features` send their inputs.
        self.model, self.preprocess = clip.load(model_name, device=self.device, jit=enable_jit)
        self.model.eval()  # Set the model to evaluation mode
        self.top_k = top_k
        self.labels = labels
        # Filled by `precompute_text_features`, or lazily on first `predict`.
        self.text_features = None

    def preprocess_data(self, image_data):
        """Apply CLIP's image transform, add a batch dimension, move to the device."""
        return self.preprocess(image_data).unsqueeze(0).to(self.device)

    def precompute_text_features(self, text_data: Optional[List[str]] = None) -> torch.Tensor:
        """Embed the prompt of every label and cache the unit-normalized result.

        Args:
            text_data: Labels to embed. Defaults to `self.labels`. `predict`
                maps similarity indices back through `self.labels`, so a
                custom list should be in the same order.

        Returns:
            The normalized text features, also stored on `self.text_features`.

        Raises:
            ValueError: If there are no labels to embed.
        """
        label_names = self.labels if text_data is None else list(text_data)
        if not label_names:
            raise ValueError("at least one label is required to compute text features")

        # `clip.tokenize` accepts a list of strings and returns one batch.
        prompts = [f"a photo of {c}" for c in label_names]
        text_inputs = clip.tokenize(prompts).to(self.device)

        with torch.no_grad():
            text_features = self.model.encode_text(text_inputs)
            # Kept so the notebook's recorded cell output stays the same.
            print(text_features.shape)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        self.text_features = text_features
        return self.text_features

    # Original (misspelled) name, kept so existing callers keep working.
    precomoute_text_features = precompute_text_features

    def predict(self, data) -> List[str]:
        """Return the `top_k` most similar labels for one image, best first.

        Text features are computed from `self.labels` on first use if
        `precompute_text_features` has not been called yet. `top_k` is clamped
        to the number of embedded labels so an oversized value cannot make
        `topk` fail.
        """
        if self.text_features is None:
            self.precompute_text_features()

        image_input = self.preprocess_data(data)

        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Scaled cosine similarity between the image and every label prompt.
            similarity = (100.0 * image_features @ self.text_features.T).softmax(dim=-1)

        k = min(self.top_k, similarity.shape[-1])
        _, indices = similarity[0].topk(k)
        return [self.labels[i] for i in indices.tolist()]

**Configs**

In [18]:
# Dataset config
dataset_name = 'ceyda/fashion-products-small'
dataset_config = DataConfig(dataset_name)

# Model config
model_name = "ViT-B/32"
model_config = ModelConfig(model_name)

# Load the dataset and collect its candidate class labels
dataset_obj = Dataset(dataset_config.dataset_name)
labels = dataset_obj.get_labels()

# Inference config: top-1 prediction over the first 200 examples
inference_config = InferenceConfig(model_config,
                                   labels,
                                   top_k=1,
                                   num_of_inf_samples=200)


Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


In [20]:
# Print the unique-id count and the masterCategory / subCategory distributions.
dataset_obj.get_dataset_stats()

num of samples: 42700
masterCatergory Counter({'Apparel': 20332, 'Accessories': 11133, 'Footwear': 9177, 'Personal Care': 1929, 'Free Items': 103, 'Sporting Goods': 25, 'Home': 1})
subCatergory Counter({'Topwear': 14737, 'Shoes': 7316, 'Bags': 3022, 'Watches': 2522, 'Bottomwear': 2453, 'Innerwear': 1679, 'Jewellery': 1072, 'Eyewear': 1044, 'Fragrance': 1003, 'Sandal': 958, 'Flip Flops': 903, 'Wallets': 893, 'Belts': 808, 'Socks': 687, 'Loungewear and Nightwear': 468, 'Dress': 451, 'Saree': 426, 'Headwear': 290, 'Lips': 267, 'Ties': 258, 'Nails': 228, 'Makeup': 211, 'Accessories': 136, 'Scarves': 118, 'Cufflinks': 108, 'Apparel Set': 106, 'Free Gifts': 102, 'Stoles': 89, 'Skin Care': 77, 'Skin': 66, 'Mufflers': 38, 'Eyes': 38, 'Shoe Accessories': 24, 'Sports Equipment': 21, 'Gloves': 19, 'Hair': 18, 'Bath and Body': 12, 'Water Bottle': 7, 'Perfumes': 6, 'Umbrellas': 6, 'Beauty Accessories': 4, 'Wristbands': 4, 'Sports Accessories': 3, 'Home Furnishing': 1, 'Vouchers': 1})


In [21]:
# Build the zero-shot classifier from the model and inference configs.
inference_model = InferenceModel(
    model_name=model_config.model_name,
    enable_jit=model_config.enable_jit,
    labels=inference_config.labels,
    top_k=inference_config.top_k,
)

# TODO: make this part of InferenceModel.__init__, controlled by a flag.
# Embed the text description of every class once, up front.
text_features = inference_model.precomoute_text_features(dataset_obj.labels)

torch.Size([45, 512])


In [22]:
# Execute inference
# Classify the leading examples of the dataset and keep each prediction next
# to its ground-truth sub-category for the evaluation step.
predict_label_list = []
true_label_list = []

# `num_of_inf_samples` is Optional: treat None as "use every example", and
# clamp to the dataset size so an oversized request cannot index past the end.
num_available = len(dataset_obj.dataset)
num_requested = inference_config.num_of_inf_samples
num_to_run = num_available if num_requested is None else min(num_requested, num_available)

for idx in range(num_to_run):
  example = dataset_obj.dataset[idx]
  image_data = example['image']
  true_label = example['subCategory']
  predict_label = inference_model.predict(image_data)
  true_label_list.append(true_label)
  predict_label_list.append(predict_label)
  # Log every 10th example as a lightweight progress / sanity check.
  if (idx % 10) == 0:
    print(f"Predicted: {predict_label}, Actual: {true_label}, for top_k = {inference_config.top_k}")

Predicted: ['Innerwear'], Actual: Topwear, for top_k = 1
Predicted: ['Sports Accessories'], Actual: Shoes, for top_k = 1
Predicted: ['Watches'], Actual: Watches, for top_k = 1
Predicted: ['Innerwear'], Actual: Topwear, for top_k = 1
Predicted: ['Watches'], Actual: Watches, for top_k = 1
Predicted: ['Innerwear'], Actual: Innerwear, for top_k = 1
Predicted: ['Bags'], Actual: Bags, for top_k = 1
Predicted: ['Topwear'], Actual: Topwear, for top_k = 1
Predicted: ['Flip Flops'], Actual: Flip Flops, for top_k = 1
Predicted: ['Innerwear'], Actual: Innerwear, for top_k = 1
Predicted: ['Shoes'], Actual: Flip Flops, for top_k = 1
Predicted: ['Innerwear'], Actual: Innerwear, for top_k = 1
Predicted: ['Shoes'], Actual: Shoes, for top_k = 1
Predicted: ['Bags'], Actual: Bags, for top_k = 1
Predicted: ['Shoes'], Actual: Shoes, for top_k = 1
Predicted: ['Innerwear'], Actual: Innerwear, for top_k = 1
Predicted: ['Watches'], Actual: Watches, for top_k = 1
Predicted: ['Shoes'], Actual: Shoes, for top_k = 

In [23]:
def eval_precision(true_label_list, predict_label_list, top_k: Optional[int] = None):
  """Compute and print top-k precision of the predictions.

  A sample counts as a hit when its true label appears among its predicted
  labels. The score is hits divided by the number of samples actually
  passed in, so it stays correct even when that differs from the configured
  sample count.

  Args:
    true_label_list: Ground-truth label of each sample.
    predict_label_list: For each sample, the list of predicted labels.
    top_k: Value shown in the printed message. When omitted it is taken
      from the longest prediction list.

  Returns:
    A list with 1 for every hit and 0 for every miss, in input order.

  Raises:
    ValueError: If the two lists have different lengths.
  """
  if len(true_label_list) != len(predict_label_list):
    raise ValueError(
        f"got {len(true_label_list)} true labels but "
        f"{len(predict_label_list)} predictions")

  eval_decision = [
      int(true_label in predicted)
      for true_label, predicted in zip(true_label_list, predict_label_list)
  ]

  if top_k is None:
    top_k = max((len(predicted) for predicted in predict_label_list), default=0)

  # Guard the empty case so an empty evaluation reports 0.0 instead of raising.
  precision = sum(eval_decision) / len(eval_decision) if eval_decision else 0.0

  print(f"Precision of Clip for (top_k = {top_k}) is {precision}")
  return eval_decision

# Score the predictions collected by the inference loop; the return value is
# the per-sample list of 1 (hit) / 0 (miss).
eval_metric = eval_precision(true_label_list, predict_label_list)

Precision of Clip for (top_k = 1) is 0.565


**Fine Tuning CLIP**