In [None]:
from turtle import down
import torch
from PIL import Image
import clip
from torchvision import transforms
import os
import numpy as np


import sys
from matplotlib import gridspec
from matplotlib import pyplot as plt
sys.path.append('src')
from score_util_pub import *
from inference import *
import json


In [None]:
model = Generator('sdxl-light-1')

amplification_factor = [1.0]*7

# You can manually select amplification factor for each block 

amplification_factor[0] = 1.1 # amp factor for down 0 block
amplification_factor[1] = 1.3 # amp factor for down 1 block
amplification_factor[2] = 1.6 # amp factor for down 2 block
amplification_factor[3] = 1.8 # amp factor for middle block

# Or you can use the automatically found params
# Please note that the automatically found params are searched for each block 
# and needs to be further scaled to be used in combination.

# file_path = f'./results/sdxl-light-1/house/amp_factors_80.json'
file_path = f'./results/amp_factors_80.json'
with open(file_path, 'r') as file:
    data = json.load(file)
amplification_factor[0] = 1+(data[0][0]-1)*0.2
amplification_factor[1] = 1+(data[0][1]-1)*0.2
amplification_factor[2] = 1+(data[0][2]-1)*0.1
amplification_factor[3] = 1+(data[0][3]-1)*0.1

# Generate Images

In [None]:
obj = "house"
for seed in range(9):
    prompt = f'a creative {obj}'
    save_path = f'./results/{obj}/seed_{seed}/orig/{seed}.jpg'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    orig = model.orig(prompt=prompt, seed=seed, save_path=save_path)
    
    save_path = f'./results/{obj}/seed_{seed}/c3/{seed}.jpg'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    c3 = model.c3(prompt=prompt, seed=seed, replace_mask=amplification_factor, cutoff=[10.0,5.0,5.0,5.0,1.0,1.0,1.0], save_path=save_path)
    
    save_path = f'./results/{obj}/seed_{seed}/up/{seed}.jpg'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    up_trans = model.dual_stage(prompt=prompt, seed=seed, replace_mask=amplification_factor, cutoff=[10.0,5.0,5.0,5.0,1.0,1.0,1.0], filter_factor=0.8, saliency_fft=False, save_path=save_path)
    
    save_path = f'./results/{obj}/seed_{seed}/down/{seed}.jpg'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    down_saliency = model.dual_stage(prompt=prompt, seed=seed, replace_mask=amplification_factor, cutoff=[10.0,5.0,5.0,5.0,1.0,1.0,1.0], apply_filter=False, filter_factor=0.8, saliency_fft=True, save_path=save_path)
    
    save_path = f'./results/{obj}/seed_{seed}/ours/{seed}.jpg'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    dual = model.dual_stage(prompt=prompt, seed=seed, replace_mask=amplification_factor,cutoff=[10.0,5.0,5.0,5.0,1.0,1.0,1.0], filter_factor=0.8, save_path=save_path)

In [4]:
orig_images = []
c3_images = []
up_images = []
down_images = []
ours_images = []

# create image lists
for seed in range(9):
    # orig
    image_dir = f'./results/{obj}/seed_{seed}/orig/{seed}.jpg'
    img_pil = Image.open(image_dir)
    orig_images.append(img_pil)
    
    # c3
    image_dir = f'./results/{obj}/seed_{seed}/c3/{seed}.jpg'
    img_pil = Image.open(image_dir)
    c3_images.append(img_pil)
    
    # up
    image_dir = f'./results/{obj}/seed_{seed}/up/{seed}.jpg'
    img_pil = Image.open(image_dir)
    up_images.append(img_pil)
    
    # down
    image_dir = f'./results/{obj}/seed_{seed}/down/{seed}.jpg'
    img_pil = Image.open(image_dir)
    down_images.append(img_pil)
    
    #ours
    image_dir = f'./results/{obj}/seed_{seed}/ours/{seed}.jpg'
    img_pil = Image.open(image_dir)
    ours_images.append(img_pil)

# Clip

In [5]:
# CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)

# 输入文本
text_prompt = f'a creative {obj}'
text_tokens = clip.tokenize(text_prompt).to(device)
with torch.no_grad():
    text_feat = model.encode_text(text_tokens)
    text_feat /= text_feat.norm(dim=-1, keepdim=True)
    

for seed in range(9):
    
    clip_orig = []
    clip_c3 = []
    clip_up = []
    clip_down = []
    clip_ours = []

    # orig
    for img in orig_images:
        image = img.convert("RGB")
        image_input = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            img_feat = model.encode_image(image_input)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)

        score = torch.matmul(img_feat, text_feat.T).item()
        clip_orig.append(score)
    
    # c3
    for img in c3_images:
        image = img.convert("RGB")
        image_input = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            img_feat = model.encode_image(image_input)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)

        score = torch.matmul(img_feat, text_feat.T).item()
        clip_c3.append(score)
        
    # up
    for img in up_images:
        image = img.convert("RGB")
        image_input = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            img_feat = model.encode_image(image_input)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)

        score = torch.matmul(img_feat, text_feat.T).item()
        clip_up.append(score)
        
    # down
    for img in down_images:
        image = img.convert("RGB")
        image_input = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            img_feat = model.encode_image(image_input)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)

        score = torch.matmul(img_feat, text_feat.T).item()
        clip_down.append(score)
    
    # orig
    for img in ours_images:
        image = img.convert("RGB")
        image_input = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            img_feat = model.encode_image(image_input)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)

        score = torch.matmul(img_feat, text_feat.T).item()
        clip_ours.append(score)
        
    
    

print("\n====== CLIP Scores Orig ======")
print(f'mean: {np.mean(clip_orig):.4f}')
print(f'std: {np.std(clip_orig):.4f}')

print("\n====== CLIP Scores c3 ======")
print(f'mean: {np.mean(clip_c3):.4f}')
print(f'std: {np.std(clip_c3):.4f}')

print("\n====== CLIP Scores up ======")
print(f'mean: {np.mean(clip_up):.4f}')
print(f'std: {np.std(clip_up):.4f}')

print("\n====== CLIP Scores down ======")
print(f'mean: {np.mean(clip_down):.4f}')
print(f'std: {np.std(clip_down):.4f}')

print("\n====== CLIP Scores ours ======")
print(f'mean: {np.mean(clip_ours):.4f}')
print(f'std: {np.std(clip_ours):.4f}')


mean: 0.2817
std: 0.0165

mean: 0.2815
std: 0.0105

mean: 0.2865
std: 0.0088

mean: 0.2823
std: 0.0102

mean: 0.2872
std: 0.0103


# Lpips

In [6]:
import lpips

In [7]:
# Load LPIPS model (alex / vgg / squeeze)
loss_fn = lpips.LPIPS(net='alex').cuda()  # alex 
to_tensor = transforms.ToTensor()

dist_c3 = []

dist_up = []

dist_down = []

dist_ours = []

for i in range(9):
    img1 = to_tensor(orig_images[i]).unsqueeze(0).cuda()
    img2 = to_tensor(c3_images[i]).unsqueeze(0).cuda()
    
    # Normalize to [-1, 1] for LPIPS
    img1 = (img1 - 0.5) * 2
    img2 = (img2 - 0.5) * 2
    # Compute LPIPS
    dist = loss_fn(img1, img2)
    dist_c3.append(dist.item())

print("\n====== Lpips Scores C3 ======")   
print(f'mean: {np.mean(dist_c3):.4f}')
print(f'std: {np.std(dist_c3):.4f}')

for i in range(9):
    img1 = to_tensor(orig_images[i]).unsqueeze(0).cuda()
    img2 = to_tensor(up_images[i]).unsqueeze(0).cuda()
    
    # Normalize to [-1, 1] for LPIPS
    img1 = (img1 - 0.5) * 2
    img2 = (img2 - 0.5) * 2
    # Compute LPIPS
    dist = loss_fn(img1, img2)
    dist_up.append(dist.item())

print("\n====== Lpips Scores Up ======")    
print(f'mean: {np.mean(dist_up):.4f}')
print(f'std: {np.std(dist_up):.4f}')

# down
for i in range(9):
    img1 = to_tensor(orig_images[i]).unsqueeze(0).cuda()
    img2 = to_tensor(down_images[i]).unsqueeze(0).cuda()
    
    # Normalize to [-1, 1] for LPIPS
    img1 = (img1 - 0.5) * 2
    img2 = (img2 - 0.5) * 2
    # Compute LPIPS
    dist = loss_fn(img1, img2)
    dist_down.append(dist.item())

print("\n====== Lpips Scores Down ======")      
print(f'mean: {np.mean(dist_down):.4f}')
print(f'std: {np.std(dist_down):.4f}')


# ours
for i in range(9):
    img1 = to_tensor(orig_images[i]).unsqueeze(0).cuda()
    img2 = to_tensor(ours_images[i]).unsqueeze(0).cuda()
    
    # Normalize to [-1, 1] for LPIPS
    img1 = (img1 - 0.5) * 2
    img2 = (img2 - 0.5) * 2
    # Compute LPIPS
    dist = loss_fn(img1, img2)
    dist_ours.append(dist.item())

print("\n====== Lpips Scores Ours ======")     
print(f'mean: {np.mean(dist_ours):.4f}')
print(f'std: {np.std(dist_ours):.4f}')
    
# the smaller lpips, the more similar

Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]




Loading model from: /home/grads/xiaoyanzang24/miniconda3/envs/C3/lib/python3.10/site-packages/lpips/weights/v0.1/alex.pth


  self.load_state_dict(torch.load(model_path, map_location='cpu'), strict=False)



mean: 0.5477
std: 0.0439

mean: 0.5616
std: 0.0428

mean: 0.5126
std: 0.0377

mean: 0.5265
std: 0.0365


# Aesthetic

In [8]:
import torch
from aesthetic_predictor_v2_5 import convert_v2_5_from_siglip

# load model
amodel, a_preprocessor = convert_v2_5_from_siglip(
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
amodel = amodel.to(torch.bfloat16).cuda()
amodel.eval()


def calc_aesthetic_score(img: Image.Image) -> float:
    # preprocess：convert img to pixel_values
    inputs = a_preprocessor(images=img.convert("RGB"), return_tensors="pt")
    pixel_values = inputs.pixel_values.to(torch.bfloat16).cuda()

    # forward
    with torch.no_grad(), torch.inference_mode():
        score = amodel(pixel_values).logits.squeeze().float().cpu().item()
    return score

In [9]:
aes_scores_orig = []
for i, img in enumerate(orig_images):   # image_list: [PIL.Image, PIL.Image, ...]
    score = calc_aesthetic_score(img)
    aes_scores_orig.append(score)


# mean and std
print("\n===== Orig Aesthetic Score Summary =====")
print(f"Mean: {np.mean(aes_scores_orig):.4f}")
print(f"Std:  {np.std(aes_scores_orig):.4f}")

aes_scores_c3 = []
for i, img in enumerate(c3_images):   # image_list: [PIL.Image, PIL.Image, ...]
    score = calc_aesthetic_score(img)
    aes_scores_orig.append(score)


# mean and std
print("\n===== C3 Aesthetic Score Summary =====")
print(f"Mean: {np.mean(aes_scores_orig):.4f}")
print(f"Std:  {np.std(aes_scores_orig):.4f}")

aes_scores_up = []
for i, img in enumerate(up_images):   # image_list: [PIL.Image, PIL.Image, ...]
    score = calc_aesthetic_score(img)
    aes_scores_up.append(score)


# mean and std
print("\n===== Up Aesthetic Score Summary =====")
print(f"Mean: {np.mean(aes_scores_up):.4f}")
print(f"Std:  {np.std(aes_scores_up):.4f}")

aes_scores_down = []
for i, img in enumerate(down_images):   # image_list: [PIL.Image, PIL.Image, ...]
    score = calc_aesthetic_score(img)
    aes_scores_down.append(score)


# mean and std
print("\n===== Down Aesthetic Score Summary =====")
print(f"Mean: {np.mean(aes_scores_down):.4f}")
print(f"Std:  {np.std(aes_scores_down):.4f}")

aes_scores_ours = []
for i, img in enumerate(ours_images):   # image_list: [PIL.Image, PIL.Image, ...]
    score = calc_aesthetic_score(img)
    aes_scores_ours.append(score)


# mean and std
print("\n===== Ours Aesthetic Score Summary =====")
print(f"Mean: {np.mean(aes_scores_ours):.4f}")
print(f"Std:  {np.std(aes_scores_ours):.4f}")


===== Orig Aesthetic Score Summary =====
Mean: 4.7743
Std:  0.2455

===== C3 Aesthetic Score Summary =====
Mean: 4.9653
Std:  0.3107

===== Up Aesthetic Score Summary =====
Mean: 4.9201
Std:  0.1510

===== Down Aesthetic Score Summary =====
Mean: 5.1042
Std:  0.1667

===== Ours Aesthetic Score Summary =====
Mean: 4.9410
Std:  0.1708
