In [None]:
import os
from pathlib import Path
from PIL import Image
import numpy as np
import torch
from torchvision import transforms
from tqdm import tqdm
from torch_fidelity import calculate_metrics

# Every model in this notebook is placed on the GPU when PyTorch can see one,
# and on the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def list_images(folder):
    """Recursively collect the image files stored under ``folder``.

    A path counts as an image when its suffix, compared case-insensitively,
    is one of ``.png``, ``.jpg``, ``.jpeg`` or ``.webp``.

    Args:
        folder: Directory (``str`` or ``Path``) to search, including all of
            its sub-directories.

    Returns:
        A sorted list of ``Path`` objects. Sorting matters because callers
        slice the result (``[:max_images]``); the raw ``rglob`` order depends
        on the filesystem and would make that subset vary between machines.
    """
    exts = {".png", ".jpg", ".jpeg", ".webp"}
    return sorted(
        p
        for p in Path(folder).rglob("*")
        # is_file() drops directories that merely carry an image-like suffix.
        if p.suffix.lower() in exts and p.is_file()
    )

def load_pil(path):
    """Read the image at ``path`` and return it converted to RGB.

    Args:
        path: Location of the image file, passed straight to ``Image.open``.

    Returns:
        The object produced by ``convert("RGB")`` on the opened image.
    """
    # Image.open is lazy and keeps the file handle around; the context manager
    # releases it as soon as the converted copy has been produced instead of
    # waiting for garbage collection.
    with Image.open(path) as img:
        return img.convert("RGB")


In [None]:
## Novelty
def compute_fid_precision_recall(real_dir, fake_dir, batch_size=32):
    """Run ``calculate_metrics`` on two image sets and report FID, precision and recall.

    Args:
        real_dir: Passed to ``calculate_metrics`` as ``input1``.
        fake_dir: Passed to ``calculate_metrics`` as ``input2``.
        batch_size: Batch size forwarded to ``calculate_metrics``.

    Returns:
        Dict with float values under the keys ``"FID"``, ``"Precision"`` and
        ``"Recall"``.
    """
    # Only FID and precision/recall are switched on; ISC and KID stay off.
    options = dict(
        input1=real_dir,
        input2=fake_dir,
        cuda=torch.cuda.is_available(),
        batch_size=batch_size,
        isc=False,
        fid=True,
        kid=False,
        prc=True,
        verbose=False,
    )
    raw = calculate_metrics(**options)

    # (reported label, key in the raw metrics mapping)
    reported = (
        ("FID", "frechet_inception_distance"),
        ("Precision", "precision"),
        ("Recall", "recall"),
    )
    return {label: float(raw[key]) for label, key in reported}


In [None]:
## Diversity-LPIPS
import lpips
# Build the LPIPS metric with the 'vgg' backbone, then move it to the
# notebook-wide `device`.
lpips_model = lpips.LPIPS(net='vgg')
lpips_model = lpips_model.to(device)

def compute_lpips(fake_dir, max_pairs=2000, img_size=256, seed=None):
    """Estimate diversity as the mean LPIPS distance over random image pairs.

    Args:
        fake_dir: Directory searched with ``list_images`` for generated images.
        max_pairs: Number of random pairs to score. Every pair consists of two
            different images, so exactly this many distances are averaged.
        img_size: Side length both images are resized to before comparison.
        seed: Optional integer seed for the pair sampler. ``None`` (default)
            draws from the global torch RNG, so results vary between runs.

    Returns:
        Mean LPIPS distance as a float, or ``0.0`` when fewer than two images
        are available or ``max_pairs`` is not positive.
    """
    # Sorted so that a given seed always refers to the same images.
    paths = sorted(list_images(fake_dir))
    n = len(paths)
    if n < 2 or max_pairs <= 0:
        return 0.0

    transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
    ])

    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    # Draw the first index freely and the second as a non-zero cyclic offset
    # from it. That guarantees i != j without rejection, so no sampled pair
    # is wasted and the list of distances can never end up empty.
    first = torch.randint(0, n, (max_pairs,), generator=generator)
    offset = torch.randint(1, n, (max_pairs,), generator=generator)
    second = (first + offset) % n
    pairs = list(zip(first.tolist(), second.tolist()))

    dists = []
    with torch.no_grad():
        for i, j in tqdm(pairs, desc="LPIPS"):
            img1 = transform(load_pil(paths[i])).unsqueeze(0).to(device)
            img2 = transform(load_pil(paths[j])).unsqueeze(0).to(device)
            # ToTensor produces values in [0, 1]; normalize=True tells LPIPS
            # to rescale them to the [-1, 1] range it expects by default.
            d = lpips_model(img1, img2, normalize=True)
            dists.append(d.item())

    return float(np.mean(dists))

In [None]:
## Diversity: Vendi Score
import clip

# Load the "ViT-B/32" CLIP checkpoint onto `device`. `clip.load` hands back the
# model together with the preprocessing callable that the metric functions in
# this notebook apply to each PIL image before calling `encode_image`.
# NOTE(review): OpenAI's CLIP is believed to keep float16 weights on CUDA —
# confirm before handing its features to code that needs float32/float64.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

@torch.no_grad()
def compute_vendi(fake_dir, max_images=200):
    """Compute a Vendi-style diversity score from CLIP image embeddings.

    Each image is embedded with ``clip_model.encode_image`` and L2-normalised.
    The cosine-similarity matrix is mapped from [-1, 1] to [0, 1], and the
    score is the exponential of the Shannon entropy of that matrix's
    eigenvalues after they are normalised to sum to one.

    Args:
        fake_dir: Directory searched with ``list_images`` for generated images.
        max_images: Upper bound on how many images (in sorted path order) are
            embedded.

    Returns:
        The score as a float, or ``0.0`` when no image is found.
    """
    # Sorted so the evaluated subset does not depend on filesystem order.
    paths = sorted(list_images(fake_dir))[:max_images]
    if not paths:
        return 0.0

    feats = []
    for path in tqdm(paths, desc="CLIP feats"):
        img = clip_preprocess(load_pil(path)).unsqueeze(0).to(device)
        # Upcast first: the encoder can emit float16, which loses precision in
        # the normalisation and is rejected outright by numpy.linalg.
        feat = clip_model.encode_image(img).float()
        feat = feat / feat.norm(dim=-1, keepdim=True)
        feats.append(feat.cpu().numpy())

    feats = np.concatenate(feats, axis=0).astype(np.float64)
    K = feats @ feats.T
    K = (K + 1) / 2  # normalize to [0,1]

    eigvals = np.linalg.eigvalsh(K)
    eigvals = np.maximum(eigvals, 0)  # clamp tiny negative round-off
    probs = eigvals / eigvals.sum()

    # 0 * log(0) is taken as 0, so zero-probability entries are dropped
    # rather than perturbed with an epsilon.
    probs = probs[probs > 0]
    entropy = -np.sum(probs * np.log(probs))
    return float(np.exp(entropy))

In [None]:
## Usability: CLIP Score
@torch.no_grad()
def compute_clip_score(fake_dir, prompt, max_images=200):
    """Average CLIP cosine similarity between generated images and a text prompt.

    Args:
        fake_dir: Directory searched with ``list_images`` for generated images.
        prompt: Text that every image is compared against.
        max_images: Upper bound on how many images (in sorted path order) are
            scored.

    Returns:
        Mean cosine similarity between the L2-normalised image and text
        embeddings as a float, or ``0.0`` when no image is found.
    """
    # Sorted so the evaluated subset does not depend on filesystem order.
    paths = sorted(list_images(fake_dir))[:max_images]
    if not paths:
        # Averaging an empty list would yield NaN plus a RuntimeWarning.
        return 0.0

    text = clip.tokenize([prompt]).to(device)
    # Upcast so normalisation and the dot product are not done in float16.
    text_feat = clip_model.encode_text(text).float()
    text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)

    sims = []
    for path in tqdm(paths, desc="CLIP score"):
        img = clip_preprocess(load_pil(path)).unsqueeze(0).to(device)
        img_feat = clip_model.encode_image(img).float()
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        sims.append((img_feat @ text_feat.T).item())

    return float(np.mean(sims))

In [None]:
## Usability: BLIP VQA
from transformers import BlipForQuestionAnswering, BlipProcessor

# Processor and VQA model both come from the "Salesforce/blip-vqa-base"
# checkpoint. The model is moved to the notebook-wide `device` and switched to
# evaluation mode before use.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
blip_model = blip_model.to(device)
blip_model.eval()

@torch.no_grad()
def compute_blip_yes_ratio(fake_dir, obj_name, max_images=200):
    """Fraction of images for which BLIP VQA answers "yes" to an object question.

    Every image is asked ``"Is this image a {obj_name}?"``; an answer counts
    as positive when its first word, lower-cased and stripped of trailing
    punctuation, is ``yes``.

    Args:
        fake_dir: Directory searched with ``list_images`` for generated images.
        obj_name: Object name interpolated into the question.
        max_images: Upper bound on how many images (in sorted path order) are
            queried.

    Returns:
        Ratio of positive answers in ``[0, 1]``, or ``0.0`` when no image is
        found.
    """
    # Sorted so the evaluated subset does not depend on filesystem order.
    paths = sorted(list_images(fake_dir))[:max_images]
    if not paths:
        # Avoids a ZeroDivisionError on an empty directory.
        return 0.0
    question = f"Is this image a {obj_name}?"

    yes = 0
    for path in tqdm(paths, desc="BLIP VQA"):
        img = load_pil(path)
        inputs = blip_processor(img, question, return_tensors="pt").to(device)
        out = blip_model.generate(**inputs)
        ans = blip_processor.decode(out[0], skip_special_tokens=True).lower()
        # Compare the leading word instead of a substring search, which would
        # also count answers such as "eyes".
        words = ans.split()
        if words and words[0].strip(".,!?") == "yes":
            yes += 1

    return yes / len(paths)

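Before running the evaluation, replace the placeholder `real_dir` and `fake_dir` paths in `evaluate_model` below with the directories that hold your real and generated images.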

In [None]:
def evaluate_model(
    real_dir="path",
    fake_dir="path",
    obj_name="chair",
):
    """Run every metric in this notebook on one set of generated images.

    Results are printed section by section (novelty, diversity, usability)
    and also returned merged into a single dict.

    Args:
        real_dir: Directory of reference images, used for the novelty metrics.
        fake_dir: Directory of generated images, used by every metric.
        obj_name: Object name used to build the CLIP prompt
            (``"a creative {obj_name}"``) and passed to the BLIP metric.

    Returns:
        Dict holding the novelty metrics plus the keys ``"LPIPS"``,
        ``"Vendi"``, ``"CLIP"`` and ``"BLIP"``.
    """
    prompt = f"a creative {obj_name}"

    print("=== Novelty ===")
    novelty_scores = compute_fid_precision_recall(real_dir, fake_dir)
    print(novelty_scores)

    print("=== Diversity ===")
    diversity_scores = {
        "LPIPS": compute_lpips(fake_dir),
        "Vendi": compute_vendi(fake_dir),
    }
    print(diversity_scores)

    print("=== Usability ===")
    usability_scores = {
        "CLIP": compute_clip_score(fake_dir, prompt),
        "BLIP": compute_blip_yes_ratio(fake_dir, obj_name),
    }
    print(usability_scores)

    # Merge the three sections in the order they were computed.
    summary = {}
    for section in (novelty_scores, diversity_scores, usability_scores):
        summary.update(section)
    return summary
