In [1]:
pip install --upgrade torch-fidelity


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from pathlib import Path
from PIL import Image
import numpy as np
import torch
from torchvision import transforms
from tqdm import tqdm
from torch_fidelity import calculate_metrics

device = "cuda" if torch.cuda.is_available() else "cpu"

### Novelty

In [3]:
def list_images(folder):
    exts = {".png", ".jpg", ".jpeg", ".webp"}
    return [p for p in Path(folder).rglob("*") if p.suffix.lower() in exts]

def load_pil(path):
    return Image.open(path).convert("RGB")

def compute_fid_precision_recall(real_dir, fake_dir, batch_size=32):
    metrics = calculate_metrics(
        input1=real_dir,
        input2=fake_dir,
        cuda=torch.cuda.is_available(),
        batch_size=batch_size,
        isc=False,
        fid=True,
        kid=False,
        prc=True,
        verbose=False,
    )
    return {
        "FID": float(metrics.get("frechet_inception_distance", -1)),
        "Precision": float(metrics.get("precision", -1)),
        "Recall": float(metrics.get("recall", -1)),
    }

In [4]:
method_names = ["original", "c3", "upblock_transform", "saliency_gating", "both"]
nouns = ["chair", "car"]

from PIL import Image
import os

def resize_images(folder, size=(299, 299)):
    for filename in os.listdir(folder):
        if filename.lower().endswith((".jpg", ".jpeg", ".png", ".webp")):
            path = os.path.join(folder, filename)
            img = Image.open(path).convert("RGB").resize(size)
            img.save(path)



for noun in nouns:
    real_path = os.path.join("./dataset", noun, "test")
    resize_images(real_path, size=(299, 299))
    for method in method_names:
        print(f"=== Computing metrics for {noun}, {method} ===")
        fake_path = os.path.join("./dataset", noun, method)
        resize_images(fake_path, size=(299, 299))
        metrics = compute_fid_precision_recall(real_path, fake_path)
        print(f"Noun: {noun}, Method: {method}, FID: {metrics['FID']:.2f}, Precision: {metrics['Precision']:.4f}, Recall: {metrics['Recall']:.4f}")

=== Computing metrics for chair, original ===


  img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())).view(height, width, 3)


Noun: chair, Method: original, FID: 106.54, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for chair, c3 ===
Noun: chair, Method: c3, FID: 216.94, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for chair, upblock_transform ===
Noun: chair, Method: upblock_transform, FID: 218.52, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for chair, saliency_gating ===
Noun: chair, Method: saliency_gating, FID: 201.90, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for chair, both ===
Noun: chair, Method: both, FID: 204.35, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for car, original ===
Noun: car, Method: original, FID: 99.95, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for car, c3 ===
Noun: car, Method: c3, FID: 116.58, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for car, upblock_transform ===
Noun: car, Method: upblock_transform, FID: 119.88, Precision: -1.0000, Recall: -1.0000
=== Computing metrics for car, sa

### Diversity

In [9]:
## Diversity-LPIPS
import lpips
lpips_model = lpips.LPIPS(net='vgg').to(device)

def compute_lpips(fake_dir, max_pairs=2000, img_size=256):
    paths = list_images(fake_dir)
    n = len(paths)
    if n < 2:
        return 0.0

    transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
    ])

    # Random sampling pairs
    idx = torch.randint(0, n, (max_pairs * 2,))
    pairs = [(idx[i].item(), idx[i+1].item()) for i in range(0, len(idx), 2)]

    dists = []
    for i, j in tqdm(pairs, desc="LPIPS"):
        if i == j:
            continue
        img1 = transform(load_pil(paths[i])).unsqueeze(0).to(device)
        img2 = transform(load_pil(paths[j])).unsqueeze(0).to(device)
        with torch.no_grad():
            d = lpips_model(img1, img2)
        dists.append(d.item())

    return float(np.mean(dists))
## Diversity: Vendi Score
import clip

clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

@torch.no_grad()
def compute_vendi(fake_dir, max_images=200):
    paths = list_images(fake_dir)[:max_images]
    feats = []

    for p in tqdm(paths, desc="CLIP feats"):
        img = clip_preprocess(load_pil(p)).unsqueeze(0).to(device)
        feat = clip_model.encode_image(img)
        feat = feat / feat.norm(dim=-1, keepdim=True)
        feats.append(feat.cpu().numpy())

    feats = np.concatenate(feats, axis=0)
    K = feats @ feats.T
    K = (K + 1) / 2  # normalize to [0,1]

    K = K.astype(np.float64)

    eigvals = np.linalg.eigvalsh(K)
    eigvals = np.maximum(eigvals, 0)
    p = eigvals / eigvals.sum()

    entropy = -np.sum(p * np.log(p + 1e-12))
    return float(np.exp(entropy))

Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off]




Loading model from: /home/tai_phan/anaconda3/envs/C3/lib/python3.10/site-packages/lpips/weights/v0.1/vgg.pth


In [10]:
for noun in nouns:
    for method in method_names:
        fake_path = os.path.join("./dataset", noun, method)
        lpips_score = compute_lpips(fake_path, max_pairs=2000, img_size=299)
        vendi_score = compute_vendi(fake_path, max_images=100)
        print(f"Noun: {noun}, Method: {method}, LPIPS: {lpips_score:.4f}, Vendi: {vendi_score:.4f}")

LPIPS: 100%|██████████| 2000/2000 [00:30<00:00, 66.28it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 92.07it/s]


Noun: chair, Method: original, LPIPS: 0.4495, Vendi: 1.5611


LPIPS: 100%|██████████| 2000/2000 [00:30<00:00, 65.45it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 86.30it/s]


Noun: chair, Method: c3, LPIPS: 0.5240, Vendi: 1.9206


LPIPS: 100%|██████████| 2000/2000 [00:32<00:00, 62.25it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 81.54it/s]


Noun: chair, Method: upblock_transform, LPIPS: 0.5380, Vendi: 1.9791


LPIPS: 100%|██████████| 2000/2000 [00:31<00:00, 64.14it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 85.54it/s]


Noun: chair, Method: saliency_gating, LPIPS: 0.4994, Vendi: 1.8975


LPIPS: 100%|██████████| 2000/2000 [00:32<00:00, 61.11it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 89.37it/s]


Noun: chair, Method: both, LPIPS: 0.5214, Vendi: 1.9528


LPIPS: 100%|██████████| 2000/2000 [00:32<00:00, 60.71it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 87.99it/s]


Noun: car, Method: original, LPIPS: 0.6164, Vendi: 2.2200


LPIPS: 100%|██████████| 2000/2000 [00:32<00:00, 61.43it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 73.53it/s]


Noun: car, Method: c3, LPIPS: 0.6333, Vendi: 2.4522


LPIPS: 100%|██████████| 2000/2000 [00:32<00:00, 60.88it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 78.22it/s]


Noun: car, Method: upblock_transform, LPIPS: 0.6373, Vendi: 2.4247


LPIPS: 100%|██████████| 2000/2000 [00:32<00:00, 60.79it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 88.68it/s]


Noun: car, Method: saliency_gating, LPIPS: 0.6318, Vendi: 2.3777


LPIPS: 100%|██████████| 2000/2000 [00:33<00:00, 60.52it/s]
CLIP feats: 100%|██████████| 100/100 [00:01<00:00, 74.22it/s]

Noun: car, Method: both, LPIPS: 0.6345, Vendi: 2.3733





### Usability

In [12]:
## Usability: CLIP Score
@torch.no_grad()
def compute_clip_score(fake_dir, prompt, max_images=200):
    paths = list_images(fake_dir)[:max_images]

    text = clip.tokenize([prompt]).to(device)
    text_feat = clip_model.encode_text(text)
    text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)

    sims = []
    for p in tqdm(paths, desc="CLIP score"):
        img = clip_preprocess(load_pil(p)).unsqueeze(0).to(device)
        img_feat = clip_model.encode_image(img)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        sims.append((img_feat @ text_feat.T).item())

    return float(np.mean(sims))
## Usability: BLIP VQA
from transformers import BlipForQuestionAnswering, BlipProcessor

blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
blip_model.eval()

@torch.no_grad()
def compute_blip_yes_ratio(fake_dir, obj_name, max_images=200):
    paths = list_images(fake_dir)[:max_images]
    question = f"Is this image a {obj_name}?"

    yes = 0
    for p in tqdm(paths, desc="BLIP VQA"):
        img = load_pil(p)
        inputs = blip_processor(img, question, return_tensors="pt").to(device)
        out = blip_model.generate(**inputs)
        ans = blip_processor.decode(out[0], skip_special_tokens=True).lower()
        if "yes" in ans:
            yes += 1

    return yes / len(paths)

In [13]:
for noun in nouns:
    for method in method_names:
        fake_path = os.path.join("./dataset", noun, method)
        prompt = f"a creative {noun}"
        clip_score = compute_clip_score(fake_path, prompt, max_images=100)
        blip_yes_ratio = compute_blip_yes_ratio(fake_path, noun, max_images=100)
        print(f"Noun: {noun}, Method: {method}, CLIP Score: {clip_score:.4f}, BLIP Yes Ratio: {blip_yes_ratio:.4f}")


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 75.14it/s]
BLIP VQA: 100%|██████████| 100/100 [00:07<00:00, 14.00it/s]


Noun: chair, Method: original, CLIP Score: 0.2879, BLIP Yes Ratio: 0.9900


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 81.52it/s]
BLIP VQA: 100%|██████████| 100/100 [00:08<00:00, 12.44it/s]


Noun: chair, Method: c3, CLIP Score: 0.2766, BLIP Yes Ratio: 0.8900


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 69.08it/s]
BLIP VQA: 100%|██████████| 100/100 [00:07<00:00, 13.42it/s]


Noun: chair, Method: upblock_transform, CLIP Score: 0.2781, BLIP Yes Ratio: 0.8700


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 84.38it/s]
BLIP VQA: 100%|██████████| 100/100 [00:06<00:00, 15.01it/s]


Noun: chair, Method: saliency_gating, CLIP Score: 0.2772, BLIP Yes Ratio: 0.8900


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 84.75it/s]
BLIP VQA: 100%|██████████| 100/100 [00:06<00:00, 14.44it/s]


Noun: chair, Method: both, CLIP Score: 0.2792, BLIP Yes Ratio: 0.8600


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 80.71it/s]
BLIP VQA: 100%|██████████| 100/100 [00:07<00:00, 12.71it/s]


Noun: car, Method: original, CLIP Score: 0.2718, BLIP Yes Ratio: 0.9700


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 80.07it/s]
BLIP VQA: 100%|██████████| 100/100 [00:06<00:00, 14.63it/s]


Noun: car, Method: c3, CLIP Score: 0.2842, BLIP Yes Ratio: 1.0000


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 89.14it/s]
BLIP VQA: 100%|██████████| 100/100 [00:07<00:00, 14.15it/s]


Noun: car, Method: upblock_transform, CLIP Score: 0.2870, BLIP Yes Ratio: 1.0000


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 82.57it/s]
BLIP VQA: 100%|██████████| 100/100 [00:06<00:00, 15.55it/s]


Noun: car, Method: saliency_gating, CLIP Score: 0.2838, BLIP Yes Ratio: 0.9900


CLIP score: 100%|██████████| 100/100 [00:01<00:00, 85.66it/s]
BLIP VQA: 100%|██████████| 100/100 [00:08<00:00, 11.91it/s]

Noun: car, Method: both, CLIP Score: 0.2871, BLIP Yes Ratio: 0.9900



