In [None]:
import torch
import pandas as pd
from pathlib import Path
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
from peft import PeftModel

# --- Configuration -----------------------------------------------------------
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DATA_DIR = Path("../data")
BASE = "openai/clip-vit-base-patch32"

# --- Models ------------------------------------------------------------------
# One shared processor; two models in eval mode on DEVICE: the plain checkpoint
# and a second copy of it wrapped with the adapter stored under
# ../models/clip_lora/best_model.
processor = CLIPProcessor.from_pretrained(BASE)

base_model = CLIPModel.from_pretrained(BASE)
base_model = base_model.to(DEVICE).eval()

lora_model = PeftModel.from_pretrained(
    CLIPModel.from_pretrained(BASE),
    "../models/clip_lora/best_model",
)
lora_model = lora_model.to(DEVICE).eval()

# --- Test data ---------------------------------------------------------------
# Keep only the test rows whose image file is actually present on disk.
# valid_paths[i] and valid_landmarks[i] always describe the same row.
test_df = pd.read_csv(DATA_DIR / "test.csv")
valid_paths, valid_landmarks = [], []
for rel_path, landmark in zip(test_df["image_path"], test_df["landmark_name"]):
    candidate = DATA_DIR / "images" / rel_path
    if not candidate.exists():
        continue
    valid_paths.append(candidate)
    valid_landmarks.append(landmark)

In [None]:
def embed_images(model, is_lora, batch_size=32):
    """Embed every image in ``valid_paths`` and L2-normalise the result.

    Args:
        model: The model to query. When ``is_lora`` is false,
            ``model.get_image_features`` is called directly; when true, the
            call goes through ``model.base_model`` instead.
        is_lora: Selects which of the two call paths above is used.
        batch_size: Number of images per forward pass. Defaults to 32, the
            value that used to be hard-coded.

    Returns:
        A CPU tensor with one unit-length embedding row per entry of
        ``valid_paths``, in the same order.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``valid_paths`` is
            empty (previously the latter surfaced as an opaque ``torch.cat``
            error).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not valid_paths:
        raise ValueError("valid_paths is empty; there are no images to embed")

    embeds = []
    for i in tqdm(range(0, len(valid_paths), batch_size)):
        imgs = []
        for p in valid_paths[i:i + batch_size]:
            # Close the file handle as soon as the pixels are decoded;
            # convert() returns a new in-memory image that outlives the file.
            with Image.open(p) as img:
                imgs.append(img.convert("RGB"))
        inputs = {k: v.to(DEVICE) for k, v in processor(images=imgs, return_tensors="pt").items()}
        with torch.no_grad():
            if is_lora:
                e = model.base_model.get_image_features(pixel_values=inputs["pixel_values"])
            else:
                e = model.get_image_features(**inputs)
            # Unit-normalise so a dot product with a normalised text vector
            # is a cosine similarity; move to CPU to free device memory.
            embeds.append((e / e.norm(dim=-1, keepdim=True)).cpu())
    return torch.cat(embeds)

# Embed the same image list once per model; row i of either tensor
# corresponds to valid_paths[i].
base_embeds = embed_images(model=base_model, is_lora=False)
lora_embeds = embed_images(model=lora_model, is_lora=True)

100%|██████████| 176/176 [02:20<00:00,  1.25it/s]
100%|██████████| 176/176 [02:20<00:00,  1.25it/s]


In [None]:
def evaluate(model, img_embeds, is_lora, n=500):
    """Measure text-to-landmark retrieval accuracy at k = 1, 5 and 10.

    A fixed-seed sample of test rows is drawn from those whose landmark has at
    least one image in ``valid_landmarks``. For each sampled row the
    ``description`` text is embedded, compared with every image embedding, and
    the similarities are averaged per landmark. A hit at k is counted when the
    row's own landmark is among the k highest-scoring landmarks.

    Args:
        model: The model to query. When ``is_lora`` is false,
            ``model.get_text_features`` is called directly; when true, the
            call goes through ``model.base_model`` instead.
        img_embeds: Normalised image embeddings, one row per entry of
            ``valid_landmarks`` (same order).
        is_lora: Selects which of the two call paths above is used.
        n: Maximum number of test rows to sample. If fewer candidate rows
            exist, all of them are used instead of raising.

    Returns:
        ``{1: pct, 5: pct, 10: pct}`` - hit rates in percent, rounded to one
        decimal place, relative to the number of rows actually evaluated.

    Raises:
        ValueError: If ``n`` is not positive, no candidate rows exist, or
            ``img_embeds`` and ``valid_landmarks`` differ in length.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    if img_embeds.shape[0] != len(valid_landmarks):
        raise ValueError(
            f"img_embeds has {img_embeds.shape[0]} rows but valid_landmarks has "
            f"{len(valid_landmarks)} entries; re-run the embedding cell"
        )

    candidates = test_df[test_df["landmark_name"].isin(set(valid_landmarks))]
    if candidates.empty:
        raise ValueError("no test rows belong to a landmark with an available image")

    # DataFrame.sample raises when n exceeds the population, so clamp first and
    # use the clamped value as the denominator below.
    n = min(n, len(candidates))
    samples = candidates.sample(n=n, random_state=42)
    hits = {1: 0, 5: 0, 10: 0}

    # Loop-invariant: which embedding rows belong to which landmark. Built in
    # first-appearance order so tie-breaking in the stable sort is unchanged.
    landmark_rows = {}
    for idx, lm in enumerate(valid_landmarks):
        landmark_rows.setdefault(lm, []).append(idx)

    for _, row in tqdm(samples.iterrows(), total=n):
        inputs = {k: v.to(DEVICE) for k, v in processor(text=[row["description"]], return_tensors="pt", padding=True, truncation=True).items()}
        with torch.no_grad():
            if is_lora:
                t = model.base_model.get_text_features(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
            else:
                t = model.get_text_features(**inputs)
            t = t / t.norm(dim=-1, keepdim=True)

        # squeeze(0) drops only the query axis, so a single-image gallery still
        # yields a 1-D vector; one tolist() replaces a per-element .item().
        sims = (t.cpu() @ img_embeds.T).squeeze(0).tolist()

        landmark_avg = {
            lm: sum(sims[i] for i in rows) / len(rows)
            for lm, rows in landmark_rows.items()
        }
        ranked = sorted(landmark_avg, key=landmark_avg.get, reverse=True)

        for k in hits:
            if row["landmark_name"] in ranked[:k]:
                hits[k] += 1

    return {k: round(100 * v / n, 1) for k, v in hits.items()}

# Score both models; evaluate() draws its sample with a fixed seed, so the two
# runs are compared on the same queries.
base_res = evaluate(base_model, base_embeds, False)
lora_res = evaluate(lora_model, lora_embeds, True)

print("Base CLIP:")
print(f"  Top-1: {base_res[1]}%, Top-5: {base_res[5]}%, Top-10: {base_res[10]}%")
print("LoRA CLIP:")
print(f"  Top-1: {lora_res[1]}%, Top-5: {lora_res[5]}%, Top-10: {lora_res[10]}%")
print("Improvement:")
# ":+.1f" rounds away float subtraction noise (e.g. 2.400000000000002) and
# prints the real sign, so a regression shows as "-1.2%" instead of "+-1.2%".
print(f"  Top-1: {lora_res[1] - base_res[1]:+.1f}%, Top-5: {lora_res[5] - base_res[5]:+.1f}%, Top-10: {lora_res[10] - base_res[10]:+.1f}%")

100%|██████████| 500/500 [00:06<00:00, 74.99it/s]
100%|██████████| 500/500 [00:07<00:00, 69.63it/s]

Base CLIP:
  Top-1: 30.8%, Top-5: 57.0%, Top-10: 69.0%
LoRA CLIP:
  Top-1: 33.2%, Top-5: 61.2%, Top-10: 70.6%
Improvement:
  Top-1: +2.400000000000002%, Top-5: +4.200000000000003%, Top-10: +1.5999999999999943%



