In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torchvision import models, transforms
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('finetune_dataset.csv')
df = df.dropna(subset=['caption']).reset_index(drop=True)
df['caption'] = df['caption'].astype(str)
filepaths = df['filepath'].tolist()
captions = df['caption'].tolist()

In [4]:
train_files, test_files, train_captions, test_captions = train_test_split(filepaths, captions, test_size=0.2, random_state=42)
val_files, test_files, val_captions, test_captions = train_test_split(test_files, test_captions, test_size=0.5, random_state=42)

In [5]:
class ImageEncoder(nn.Module):
    def __init__(self):
        super(ImageEncoder, self).__init__()
        vgg = models.vgg16(pretrained=True)
        self.features = nn.Sequential(*list(vgg.features.children()))
        self.fc = nn.Linear(512 * 7 * 7, 512)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x / x.norm(dim=-1, keepdim=True)

In [6]:
class TextEncoder(nn.Module):
    def __init__(self):
        super(TextEncoder, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.fc = nn.Linear(768, 512)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = self.fc(outputs.last_hidden_state[:, 0, :]) 
        return x / x.norm(dim=-1, keepdim=True)


In [7]:
class CrossModalModel(nn.Module):
    def __init__(self):
        super(CrossModalModel, self).__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()

    def forward(self, images, input_ids, attention_mask):
        image_embeddings = self.image_encoder(images)
        text_embeddings = self.text_encoder(input_ids, attention_mask)
        return image_embeddings, text_embeddings

In [8]:
def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    image = Image.open(image_path).convert('RGB')
    return transform(image)

def preprocess_text(caption, tokenizer):
    encoding = tokenizer(caption, return_tensors='pt', padding='max_length', truncation=True, max_length=77)
    return encoding['input_ids'], encoding['attention_mask']


In [25]:
def train_model(model, dataloader, optimizer, criterion, device, save_path = "vggbert_checkpoint.pt", epochs=20):
    model.train()
    for epoch in range(epochs):
        for images, input_ids, attention_mask in dataloader:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            optimizer.zero_grad()
            image_embeddings, text_embeddings = model(images, input_ids, attention_mask)
            loss = criterion(image_embeddings, text_embeddings)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")
    torch.save(model.state_dict(), save_path)


In [10]:
def contrastive_loss(image_embeddings, text_embeddings, margin=0.2):
    similarity_matrix = torch.mm(image_embeddings, text_embeddings.t())
    positive_loss = 1 - similarity_matrix.diagonal()
    negative_loss = torch.relu(similarity_matrix - margin).mean()
    return positive_loss.mean() + negative_loss


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(embeddings_a, embeddings_b):
    return cosine_similarity(embeddings_a, embeddings_b)

def retrieve_top_k(similarity_scores, k=5):
    return similarity_scores.argsort(axis=-1)[:, -k:][:, ::-1]

import faiss

def build_faiss_index(embeddings):
    d = embeddings.shape[1] 
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    return index

def faiss_retrieve_top_k(index, query_embeddings, k=5):
    distances, indices = index.search(query_embeddings, k)
    return indices

In [12]:
def demo_cross_modal_retrieval(image_embeddings, text_embeddings, image_files, captions, faiss_index, tokenizer, model, device, k=5):
    print("Enter a query (image path or text):")
    query = input().strip()

    if os.path.exists(query):  
        query_image = preprocess_image(query).unsqueeze(0).to(device)
        with torch.no_grad():
            query_embedding = model.image_encoder(query_image).cpu().numpy()
        
        distances, indices = faiss_index.search(query_embedding, k)
        print("\nTop Text Matches for Image Query:")
        for idx in indices[0]:
            print(f"- {captions[idx]}")

        cosine_scores = compute_similarity(query_embedding, text_embeddings)
        top_k_indices = retrieve_top_k(cosine_scores, k=k)
        print("\nCosine Similarity Matches for Image Query:")
        for idx in top_k_indices[0]:
            print(f"- {captions[idx]}")

    else: 
        input_ids, attention_mask = preprocess_text(query, tokenizer)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        with torch.no_grad():
            query_embedding = model.text_encoder(input_ids, attention_mask).cpu().numpy()

        distances, indices = faiss_index.search(query_embedding, k)
        print("\nTop Image Matches for Text Query:")
        for idx in indices[0]:
            print(f"- {image_files[idx]}")

        cosine_scores = compute_similarity(query_embedding, image_embeddings)
        top_k_indices = retrieve_top_k(cosine_scores, k=k)
        print("\nCosine Similarity Matches for Text Query:")
        for idx in top_k_indices[0]:
            print(f"- {image_files[idx]}")


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

In [14]:
from torch.utils.data import Dataset

class CrossModalDataset(Dataset):
    def __init__(self, filepaths, captions, tokenizer):
        self.filepaths = filepaths
        self.captions = captions
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        image = preprocess_image(self.filepaths[idx])
        encoding = self.tokenizer(
            self.captions[idx],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=77
        )
        input_ids = encoding['input_ids'].squeeze(0)  
        attention_mask = encoding['attention_mask'].squeeze(0) 
        return image, input_ids, attention_mask


In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [16]:
train_dataset = CrossModalDataset(train_files, train_captions, tokenizer)
val_dataset = CrossModalDataset(val_files, val_captions, tokenizer)
test_dataset = CrossModalDataset(test_files, test_captions, tokenizer)

In [17]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [19]:
model = CrossModalModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = contrastive_loss



In [26]:
train_model(model, train_loader, optimizer, criterion, device,epochs=10)

Epoch 1/10, Loss: 0.799903392791748
Epoch 2/10, Loss: 0.8003078699111938
Epoch 3/10, Loss: 0.7999312877655029
Epoch 4/10, Loss: 0.800494909286499
Epoch 5/10, Loss: 0.7995140552520752
Epoch 6/10, Loss: 0.8004022836685181
Epoch 7/10, Loss: 0.8001440763473511
Epoch 8/10, Loss: 0.8000997304916382
Epoch 9/10, Loss: 0.8024801015853882
Epoch 10/10, Loss: 0.8008776307106018


In [27]:
model

CrossModalModel(
  (image_encoder): ImageEncoder(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(i

In [28]:
import torch
import numpy as np

# Load model and tokenizer
model.eval()

def extract_embeddings(dataset, model, device):
    image_embeddings = []
    text_embeddings = []
    for images, input_ids, attention_mask in DataLoader(dataset, batch_size=16):
        with torch.no_grad():
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            img_emb, txt_emb = model(images, input_ids, attention_mask)
            image_embeddings.append(img_emb.cpu().numpy())
            text_embeddings.append(txt_emb.cpu().numpy())
    image_embeddings = np.vstack(image_embeddings)
    text_embeddings = np.vstack(text_embeddings)
    return image_embeddings, text_embeddings

image_embeddings, text_embeddings = extract_embeddings(test_dataset, model, device)

cosine_scores = compute_similarity(image_embeddings, text_embeddings)
cosine_average = np.mean(cosine_scores)

faiss_index = build_faiss_index(text_embeddings)
faiss_scores = []
for img_emb in image_embeddings:
    img_emb = img_emb.reshape(1, -1)
    distances, indices = faiss_index.search(img_emb, k=text_embeddings.shape[0])
    similarity = 1 / (1 + distances)
    faiss_scores.append(similarity.flatten())

faiss_scores = np.array(faiss_scores)
faiss_average = np.mean(faiss_scores)

np.save("cosine_scores.npy", cosine_scores)
np.save("faiss_scores.npy", faiss_scores)

print("Average Cosine Similarity:", cosine_average)
print("Average FAISS Similarity:", faiss_average)


Average Cosine Similarity: 0.6095003
Average FAISS Similarity: 0.58273697


In [28]:
demo_cross_modal_retrieval(image_embeddings, text_embeddings, test_files, test_captions, faiss_index, tokenizer, model, device, k=5)

Enter a query (image path or text):

Top Image Matches for Text Query:
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\2397.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\312.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\545.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\3509.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\2881.png

Cosine Similarity Matches for Text Query:
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\3285.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\1651.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\1574.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\729.png
- c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\3787.png


In [29]:
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import faiss

# Cosine Similarity
def compute_cosine_similarity(image_embeddings, text_embeddings):
    return torch.tensor(cosine_similarity(image_embeddings, text_embeddings))

# FAISS Similarity
def compute_faiss_similarity(image_embeddings, text_embeddings, k=5):
    image_embeddings_np = image_embeddings.astype('float32')
    text_embeddings_np = text_embeddings.astype('float32')
    index = faiss.IndexFlatIP(text_embeddings_np.shape[1])
    index.add(text_embeddings_np)
    distances, indices = index.search(image_embeddings_np, k)
    return distances, indices

# Mean Reciprocal Rank (MRR)
def compute_mrr(similarity_matrix):
    similarity_ranks = similarity_matrix.argsort(axis=-1)[:, ::-1]
    reciprocal_ranks = []
    for i in range(len(similarity_matrix)):
        rank = np.where(similarity_ranks[i] == i)[0][0] + 1
        reciprocal_ranks.append(1 / rank)
    return np.mean(reciprocal_ranks)

# Retrieval Accuracy
def compute_retrieval_accuracy(similarity_matrix, k=5):
    top_k_indices = np.argsort(-similarity_matrix, axis=-1)[:, :k]
    correct = sum([i in top_k_indices[i] for i in range(len(similarity_matrix))])
    return correct / len(similarity_matrix)

# Cross-Modal Retrieval
def cross_modal_retrieval(image_embeddings, text_embeddings, image_paths, captions, k=5):
    similarity_matrix = cosine_similarity(image_embeddings, text_embeddings)
    
    # Image-to-Text Retrieval
    print("\nImage-to-Text Retrieval:")
    for i in range(len(image_embeddings)):
        top_text_indices = np.argsort(-similarity_matrix[i])[:k]
        print(f"Image {i + 1}: {image_paths[i]}")
        for rank, idx in enumerate(top_text_indices):
            print(f"  Rank {rank + 1}: {captions[idx]}")

    # Text-to-Image Retrieval
    print("\nText-to-Image Retrieval:")
    for i in range(len(text_embeddings)):
        top_image_indices = np.argsort(-similarity_matrix[:, i])[:k]
        print(f"Text {i + 1}: {captions[i]}")
        for rank, idx in enumerate(top_image_indices):
            print(f"  Rank {rank + 1}: {image_paths[idx]}")

# Metric Evaluation
def evaluate_vgg_bert_model(image_embeddings, text_embeddings, image_paths, captions, k=5):
    similarity_matrix = cosine_similarity(image_embeddings, text_embeddings)

    # Metrics
    cosine_avg = similarity_matrix.mean()
    mrr = compute_mrr(similarity_matrix)
    accuracy = compute_retrieval_accuracy(similarity_matrix, k)
    
    print(f"Average Cosine Similarity: {cosine_avg:.4f}")
    print(f"Mean Reciprocal Rank (MRR): {mrr:.4f}")
    print(f"Retrieval Accuracy@{k}: {accuracy:.4f}")

    # Cross-modal retrieval demo
    cross_modal_retrieval(image_embeddings, text_embeddings, image_paths, captions, k)


In [30]:
image_embeddings, text_embeddings = extract_embeddings(test_dataset, model, device)

In [31]:
evaluate_vgg_bert_model(image_embeddings, text_embeddings, test_files, test_captions, k=5)

Average Cosine Similarity: 0.6095
Mean Reciprocal Rank (MRR): 0.0158
Retrieval Accuracy@5: 0.0118

Image-to-Text Retrieval:
Image 1: c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\972.png
  Rank 1: photo melanoma, foot, malignant tumor, pigmented, deposit, irregular, redness melanoma
  Rank 2: melanocytic nevus
  Rank 3: pigmented, lesion sol, symmetrical, melanocytic nevus, skin lesion, skin, melanocytic nevus
  Rank 4: black patch thickened skin, melanoma
  Rank 5: malignant melanoma, white patch, mole, irregular, pigmentation inflammatory reaction
Image 2: c:/Users/bhav1/Downloads/dermavqa/SkinCAP/skincap\1016.png
  Rank 1: photo melanoma, foot, malignant tumor, pigmented, deposit, irregular, redness melanoma
  Rank 2: melanocytic nevus
  Rank 3: pigmented, lesion sol, symmetrical, melanocytic nevus, skin lesion, skin, melanocytic nevus
  Rank 4: black patch thickened skin, melanoma
  Rank 5: malignant melanoma, white patch, mole, irregular, pigmentation inflammatory reaction
Ima

In [33]:
from sklearn.metrics import roc_auc_score
import numpy as np

def compute_auc_from_embeddings(image_embeddings, text_embeddings, num_samples):
   
    similarity_matrix = np.dot(image_embeddings, text_embeddings.T)
    similarity_matrix /= np.linalg.norm(image_embeddings, axis=1, keepdims=True)
    similarity_matrix /= np.linalg.norm(text_embeddings, axis=1, keepdims=True)
    
    ground_truth = np.eye(num_samples)
    
    similarity_flat = similarity_matrix.flatten()
    ground_truth_flat = ground_truth.flatten()

    auc_score = roc_auc_score(ground_truth_flat, similarity_flat)
    return auc_score

num_samples = len(image_embeddings) 
auc_score = compute_auc_from_embeddings(image_embeddings, text_embeddings, num_samples)

print(f"AUC Score: {auc_score:.4f}")


AUC Score: 0.5000
