In [3]:
pip install clip

Collecting clipNote: you may need to restart the kernel to use updated packages.

  Downloading clip-0.2.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: clip
  Building wheel for clip (setup.py): started
  Building wheel for clip (setup.py): finished with status 'done'
  Created wheel for clip: filename=clip-0.2.0-py3-none-any.whl size=7017 sha256=09178d8a0f211a1205386638effd9ed7936923ca246691619436081882c510e0
  Stored in directory: c:\users\bhav1\appdata\local\pip\cache\wheels\ab\a5\e8\c9fa20742edbccf2702dae8ee62053e6c460e961d45967b49c
Successfully built clip
Installing collected packages: clip
Successfully installed clip-0.2.0


In [5]:
pip install ftfy regex

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import faiss
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification

processor_hf = AutoProcessor.from_pretrained("chanwkim/monet")
model_hf = AutoModelForZeroShotImageClassification.from_pretrained("chanwkim/monet")
device = "cuda" if torch.cuda.is_available() else "cpu"
model_hf.to(device)
model_hf.eval()

class MONETHuggingFaceDataset(Dataset):
    def __init__(self, image_paths, captions, processor):
        self.image_paths = image_paths
        self.captions = captions
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        caption = self.captions[idx]
        inputs = self.processor(text=[caption], images=image, return_tensors="pt", padding="max_length", truncation=True)
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
        }

# Load dataset
csv_path = "finetune_dataset.csv"
df = pd.read_csv(csv_path)
image_paths = df["filepath"].tolist()
captions = df["caption"].fillna("").tolist()

dataset = MONETHuggingFaceDataset(image_paths, captions, processor_hf)
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

def extract_features(model, dataloader, device):
    model.eval()
    image_features, text_features = [], []

    with torch.no_grad():
        for batch in dataloader:
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask)
            img_feats = outputs.image_embeds
            txt_feats = outputs.text_embeds

            img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)
            txt_feats = txt_feats / txt_feats.norm(dim=-1, keepdim=True)

            image_features.append(img_feats.cpu())
            text_features.append(txt_feats.cpu())

    image_features = torch.cat(image_features, dim=0)
    text_features = torch.cat(text_features, dim=0)

    return image_features, text_features


def compute_cosine_similarity(image_features, text_features):
    return torch.matmul(image_features, text_features.T)


def cross_modal_retrieval(image_features, text_features, image_paths, captions, k=5):
    similarity_matrix = compute_cosine_similarity(image_features, text_features)
    for i in range(len(image_features)):
        print(f"\nImage {i + 1}: {image_paths[i]}")
        top_text_indices = similarity_matrix[i].topk(k, largest=True).indices
        for rank, idx in enumerate(top_text_indices):
            print(f"  Rank {rank + 1}: {captions[idx]}")

    for j in range(len(text_features)):
        print(f"\nText {j + 1}: {captions[j]}")
        top_image_indices = similarity_matrix[:, j].topk(k, largest=True).indices
        for rank, idx in enumerate(top_image_indices):
            print(f"  Rank {rank + 1}: {image_paths[idx]}")

image_features, text_features = extract_features(model_hf, test_loader, device)
cross_modal_retrieval(image_features, text_features, image_paths, captions, k=5)



Image 1: c:/Users/bhav1/Downloads/dermavqa/images/train\11mk4th.png
  Rank 1: hairline, seborrheic keratosis seborrheic keratosis, scalp
  Rank 2: image, skin, visible, lesion, skin, discoloration, lesion
  Rank 3: inflammatory, actinic keratosis, medical history
  Rank 4: image, wrist, wrinkle fine line, visible, irregularity, image, car, changing, lesion, biopsy
  Rank 5: image, skin, skin, coloration, dermatitis, wear

Image 2: c:/Users/bhav1/Downloads/dermavqa/images/train\ih99w9.jpg
  Rank 1: skin tag, skin tag, benign, skin lesion, raised, skin tag, benign, lesion
  Rank 2: skin single, flat, skin, skin fair, hair, visible, irregularity, bump discoloration, mole
  Rank 3: image, foot, itchy, varicose vein, daily, itching
  Rank 4: image, fingernail skin, fingernail, fingernail, fingertip, skin texture, irregularity, inflammation, thick, irritant
  Rank 5: folliculitis, follicular cyst, medical history, physical examination, skin, histopathological

Image 3: c:/Users/bhav1/Downlo

In [6]:
model_hf

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [9]:
from sklearn.metrics import roc_auc_score

def compute_auc(similarity_matrix, num_samples):
    ground_truth = torch.eye(num_samples).to(similarity_matrix.device)
    similarity_flat = similarity_matrix.flatten().cpu().numpy()
    ground_truth_flat = ground_truth.flatten().cpu().numpy()

    auc_score = roc_auc_score(ground_truth_flat, similarity_flat)
    return auc_score

auc_score = compute_auc(similarity_matrix, len(image_features))
print(f"AUC Score: {auc_score:.4f}")


AUC Score: 0.7960
