<a href="https://colab.research.google.com/github/appukundu/experimentation_learning/blob/main/fine_tuning_CLIP_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install Libraries

In [None]:
%%capture

# Install the third-party packages this notebook relies on. The %%capture
# cell magic on the first line hides the installer output.
# NOTE(review): versions are unpinned, so results may drift between runs —
# consider pinning once a known-good set is confirmed.
!pip install openai-clip
!pip install datasets
!pip install torch
# NOTE(review): tqdm is installed here but not imported in the cells visible
# in this notebook.
!pip install tqdm

- Model: `openai-clip` provides the base CLIP model
- Dataset: loaded from Hugging Face with the `datasets` library
- Torch: modeling code


In [None]:
from collections import Counter
from dataclasses import dataclass
from typing import List, Optional

import clip
import numpy as np
import torch


@dataclass
class ModelConfig:
    """Settings that select which CLIP checkpoint to load."""

    # Checkpoint identifier handed to the model loader (e.g. "ViT-B/32").
    model_name: str
    # Forwarded as the loader's ``jit`` flag.
    enable_jit: bool = False


@dataclass
class InferenceConfig:
    """Settings for one zero-shot inference run."""

    # Which model to run.
    model_config: ModelConfig
    # Candidate class names the model chooses between.
    labels: List[str]
    # How many of the best-scoring labels are returned per image.
    top_k: int = 1
    # Number of dataset samples to run inference on; no limit is set when None.
    num_of_inf_samples: Optional[int] = None


@dataclass
class EvalConfig:
    """Settings for scoring an inference run."""

    # The inference run being evaluated.
    inference_config: InferenceConfig
    # Name of the metric to report.
    # NOTE(review): not read anywhere in the cells shown — confirm intended use.
    metric_name: str


@dataclass
class DataConfig:
    """Settings that identify the dataset to load."""

    # Dataset identifier passed to the dataset loader.
    dataset_name: str

In [None]:
from datasets import load_dataset

class Dataset:
  """Convenience wrapper around the ``train`` split of a Hugging Face dataset.

  The split is loaded on first use and cached on ``self.dataset``, so every
  method works regardless of the order in which it is called.
  """

  def __init__(self, dataset_name: str):
    """Remember which dataset to load; nothing is downloaded yet."""
    self.dataset_name = dataset_name
    # Both attributes are filled lazily (see _ensure_loaded / get_labels).
    self.dataset = None
    self.labels = None

  def _ensure_loaded(self):
    """Load the ``train`` split once and return the cached copy afterwards."""
    if self.dataset is None:
      self.dataset = load_dataset(self.dataset_name)['train']
    return self.dataset

  def get_dataset(self):
    """Return the ``train`` split, loading it on the first call."""
    return self._ensure_loaded()

  def get_labels(self):
    """Return the distinct ``subCategory`` values as a sorted list.

    The result is also stored on ``self.labels``. Sorting makes the label
    order (and therefore any label-index mapping built from it) identical
    across sessions; a bare ``set`` of strings has no stable order.
    """
    dataset = self._ensure_loaded()
    # key=str keeps the sort safe even if a non-string value (e.g. None)
    # appears in the column.
    self.labels = sorted(set(dataset['subCategory']), key=str)
    return self.labels

  def get_dataset_stats(self):
    """Print the number of unique ids and the category frequency tables."""
    dataset = self._ensure_loaded()
    # self.dataset already *is* the train split, so columns are read directly.
    num_of_samples = len(set(dataset['id']))
    print(f"num of samples: {num_of_samples}")
    print(f"masterCatergory {Counter(dataset['masterCategory'])}")
    print(f"subCatergory {Counter(dataset['subCategory'])}")

  def display(self, idx):
    """Show the image of sample ``idx`` and print the full record."""
    dataset = self._ensure_loaded()
    example = dataset[idx]

    # Example image (``display`` here is the notebook's global display
    # function, not this method).
    display(example['image'])

    # Example data
    print(example)



In [None]:
class InferenceModel:
    """Zero-shot image classifier built on a pretrained CLIP model.

    Each label is turned into the prompt ``"a photo of <label>"`` and embedded
    once; an image is then classified by comparing its embedding against
    those label embeddings.
    """

    def __init__(self, model_name: str, enable_jit: bool, labels: List[str], top_k: int):
        """Load the CLIP model and remember the label set.

        Args:
            model_name: CLIP checkpoint name passed to ``clip.load``.
            enable_jit: Passed to ``clip.load`` as its ``jit`` flag.
            labels: Candidate class names; predictions are drawn from this list.
            top_k: Number of best-matching labels returned by ``predict``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Pass the device explicitly instead of relying on the loader's default,
        # so the model and every input tensor provably share ``self.device``.
        self.model, self.preprocess = clip.load(model_name, device=self.device, jit=enable_jit)
        self.model.eval()  # Set the model to evaluation mode
        self.top_k = top_k
        self.labels = labels
        # Filled by precompute_text_features(); predict() computes it on demand.
        self.text_features = None

    def preprocess_data(self, image_data):
        """Apply the CLIP preprocessing and return a single-image batch on the model device."""
        return self.preprocess(image_data).unsqueeze(0).to(self.device)

    def precompute_text_features(self, text_data: Optional[List[str]] = None):
        """Embed and L2-normalise one prompt per label.

        Args:
            text_data: Class names to embed, in the same order as ``self.labels``
                (``predict`` maps similarity indices back through ``self.labels``).
                Defaults to ``self.labels``.

        Returns:
            The normalised text embeddings, also stored on ``self.text_features``.
        """
        if text_data is None:
            text_data = self.labels
        text_inputs = torch.cat([clip.tokenize(f"a photo of {c}") for c in text_data]).to(self.device)

        with torch.no_grad():
            text_features = self.model.encode_text(text_inputs)
            print(text_features.shape)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        self.text_features = text_features
        return self.text_features

    def precomoute_text_features(self, text_data):
        """Backward-compatible alias for ``precompute_text_features`` (original misspelled name)."""
        return self.precompute_text_features(text_data)

    def predict(self, data):
        """Return the ``top_k`` labels most similar to the image, best match first.

        Args:
            data: A single image in whatever form ``self.preprocess`` accepts.

        Returns:
            A list of label strings, at most ``top_k`` long and never longer
            than the number of embedded labels.
        """
        if getattr(self, "text_features", None) is None:
            # Label embeddings were not precomputed yet; build them from the
            # labels given at construction time.
            self.precompute_text_features(self.labels)

        image_input = self.preprocess_data(data)

        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            # Unit-normalise so the dot product below is a cosine similarity.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ self.text_features.T).softmax(dim=-1)

        # topk raises when k exceeds the number of candidates, so clamp it.
        k = min(self.top_k, similarity.shape[-1])
        _, indices = similarity[0].topk(k)

        return [self.labels[i] for i in indices.tolist()]

**Configs**

In [None]:
# dataset config
dataset_name = 'ceyda/fashion-products-small'
dataset_config = DataConfig(dataset_name)

# model config
model_name = "ViT-B/32"
model_config = ModelConfig(model_name)

# dataset object init; get_labels() loads the dataset and returns the
# distinct subCategory values that serve as the candidate classes.
dataset_obj = Dataset(dataset_config.dataset_name)
labels = dataset_obj.get_labels()

# inference config: return the 2 best labels per image, run on 100 samples
inference_config = InferenceConfig(model_config,
                                   labels,
                                   top_k = 2,
                                   num_of_inf_samples = 100)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/151 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


dataset_infos.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

(…)-00000-of-00002-357f4cbabe1a8ea6.parquet:   0%|          | 0.00/298M [00:00<?, ?B/s]

(…)-00001-of-00002-cbe936f1880f5e72.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42700 [00:00<?, ? examples/s]

In [None]:
# Build the CLIP inference wrapper from the model and inference configs.
inference_model = InferenceModel(
    model_name=model_config.model_name,
    enable_jit=model_config.enable_jit,
    labels=inference_config.labels,
    top_k=inference_config.top_k,
)

# TODO: make this part of the constructor, controlled by a flag.
# Embed the text prompt of every class once, up front, so that each
# prediction only needs to encode the image.
text_features = inference_model.precomoute_text_features(dataset_obj.labels)

torch.Size([45, 512])


In [None]:
# Execute inference
# Inference loop: predict the top-k labels for the first N samples and keep
# them next to the ground-truth subCategory for evaluation.
predict_label_list = []
true_label_list = []

# num_of_inf_samples is Optional: None means "every sample". Also clamp to the
# dataset size so the loop can never index past the end.
inference_dataset = dataset_obj.dataset
num_samples = inference_config.num_of_inf_samples
if num_samples is None:
  num_samples = len(inference_dataset)
num_samples = min(num_samples, len(inference_dataset))

for idx in range(num_samples):
  example = inference_dataset[idx]
  image_data = example['image']
  true_label = example['subCategory']
  predict_label = inference_model.predict(image_data)
  true_label_list.append(true_label)
  predict_label_list.append(predict_label)
  # Print every 10th sample as a lightweight progress/sanity check.
  if (idx % 10) == 0:
    print(f"Predicted: {predict_label}, Actual: {true_label}, for top_k = {inference_config.top_k}")

Predicted: ['Innerwear', 'Topwear'], Actual: Topwear, for top_k = 2
Predicted: ['Sports Accessories', 'Shoes'], Actual: Shoes, for top_k = 2
Predicted: ['Watches', 'Sports Accessories'], Actual: Watches, for top_k = 2
Predicted: ['Innerwear', 'Loungewear and Nightwear'], Actual: Topwear, for top_k = 2
Predicted: ['Watches', 'Topwear'], Actual: Watches, for top_k = 2
Predicted: ['Innerwear', 'Bottomwear'], Actual: Innerwear, for top_k = 2
Predicted: ['Bags', 'Topwear'], Actual: Bags, for top_k = 2
Predicted: ['Topwear', 'Innerwear'], Actual: Topwear, for top_k = 2
Predicted: ['Flip Flops', 'Sandal'], Actual: Flip Flops, for top_k = 2
Predicted: ['Innerwear', 'Bottomwear'], Actual: Innerwear, for top_k = 2


In [None]:
# A sample counts as a hit (1) when its true label is among the predicted
# top-k labels, otherwise as a miss (0).
eval_decision = [
  1 if true_label in predicted_labels else 0
  for true_label, predicted_labels in zip(true_label_list, predict_label_list)
]

# Divide by the number of samples actually evaluated rather than the configured
# count, which may be None or larger than the dataset. The reported value is
# the fraction of hits, i.e. a top-k accuracy.
num_evaluated = len(eval_decision)
hit_rate = np.sum(np.array(eval_decision)) / num_evaluated if num_evaluated else float("nan")

print(f"Precision of Clip for (top_k = {inference_config.top_k}) is {hit_rate}")

Precision of Clip for (top_k = 2) is 0.74
