In [None]:
import torch
import json
import clip
from PIL import Image
import os
from diffusers import DiffusionPipeline
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from torchmetrics.functional.multimodal import clip_score
from functools import partial
from pipeline_rf import RectifiedFlowPipeline
import random
import hpsv2
from diffusers import AutoPipelineForText2Image

random.seed(2024)
device = "cuda:1"

### load model

In [4]:
clip.available_models()
clip_model, clip_preprocess = clip.load('ViT-L/14@336px')
clip_model = clip_model.to(device)

In [None]:
model_id = "/data/model/stable-diffusion-2-1"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
sd_pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=True)

In [None]:
instaflow_pipe = RectifiedFlowPipeline.from_pretrained("/data/model/instaflow_0_9B_from_sd_1_5", torch_dtype=torch.float16, safety_checker=None, requires_safety_checker=False) 
### switch to torch.float32 for higher quality

instaflow_pipe.to(device)  ### if GPU is not available, comment this line
instaflow_pipe.set_progress_bar_config(disable=True)

In [2]:
sdxl_turbo_pipe = AutoPipelineForText2Image.from_pretrained("/data/model/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
sdxl_turbo_pipe.to(device)
sdxl_turbo_pipe.set_progress_bar_config(disable=True)

Loading pipeline components...: 100%|██████████| 7/7 [00:09<00:00,  1.34s/it]


In [5]:
lcm_pipe = DiffusionPipeline.from_pretrained("/data/model/LCM_Dreamshaper_v7", safety_checker=None, requires_safety_checker=False)
# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
lcm_pipe.to(torch_device=device, torch_dtype=torch.float32)
lcm_pipe.set_progress_bar_config(disable=True)

Loading pipeline components...: 100%|██████████| 6/6 [00:03<00:00,  1.58it/s]
  deprecate("torch_dtype", "0.27.0", "")
  deprecate("torch_device", "0.27.0", "")


## clip score

In [6]:
def get_clip_score(image, text):
    # Load the pre-trained CLIP model and the image

    # Preprocess the image and tokenize the text
    image_input = clip_preprocess(image).unsqueeze(0)
    text_input = clip.tokenize([text])
    
    # Move the inputs to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    image_input = image_input.to(device)
    text_input = text_input.to(device)
    
    # Generate embeddings for the image and text
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_input)
    
    # Normalize the features
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    
    # Calculate the cosine similarity to get the CLIP score
    clip_score = torch.matmul(image_features, text_features.T).item()
    
    return clip_score

In [None]:
# Path to the folder containing your images
folder_path = "/home/liutao/workspace/distill/swift_photo_with_text"

# Initialize empty lists to store images and their names
image_list = []
image_name_list = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is an image (you can customize the extension check)
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
        # Load the image
        image_path = os.path.join(folder_path, filename)
        image = Image.open(image_path)

        # Append the image and its name to the lists
        image_list.append(image)
        image_name_list.append(filename)

# Now, image_list contains PIL Image objects, and image_name_list contains corresponding names
avg_score = 0
for i in range(len(image_list)):
    image = image_list[i]
    text = image_name_list[i]
    score = get_clip_score(image, text)
    avg_score += score
    
print(f"AVG CLIP Score: {avg_score/len(image_list)}") # CLIP score:0.300


In [None]:
# Load the .npz file
# data = np.load('/data/20231212/SwiftBrush_reproduce_final20231227/val2014_captions.npz')
# captions = data['captions'][()]
# print(len(captions),captions[0],captions[1])
# data.close()

### load coco30k_caption

In [7]:
coco_f = open('/data/dataset/coco2014-val/annotations/captions_val2014.json')
coco_annotations = json.load(coco_f)
captions = []
for annotation in coco_annotations['annotations']:
    caption = annotation['caption']
    captions.append(caption)
coco_f.close()
print(len(captions),captions[0],captions[1])

202654 A bicycle replica with a clock as the front wheel. A black Honda motorcycle parked in front of a garage.


In [8]:
captions_30k = random.choices(captions, k=30000)
print(len(captions_30k),captions_30k[0],captions_30k[1])

30000 a little green cart filled with assorted suitcases  A woman in an odd outfit on a bed


### ours

In [None]:
# Path to the folder containing your images
folder_path = "/home/liutao/workspace/data/ours_coco30k"

# Initialize empty lists to store images and their names
image_list = []
image_name_list = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is an image (you can customize the extension check)
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
        # Load the image
        image_path = os.path.join(folder_path, filename)
        image = Image.open(image_path)

        # Append the image and its name to the lists
        image_list.append(image)
        image_name_list.append(filename)

# Now, image_list contains PIL Image objects, and image_name_list contains corresponding names
avg_score = 0
for i in range(len(image_list)):
    image = image_list[i]
    text = image_name_list[i]
    score = get_clip_score(image, text)
    avg_score += score
    
print(f"AVG CLIP Score: {avg_score/len(image_list)}") 

### sdxl_turbo

In [None]:
#smallset_test >>> instaflow:0.26 sd_1_step:0.138 sd_25_step:0.22
#instaflow coco30k clip_socre: 0.2580452107747396
#sdxl_turbo_4_step coco30K clip_socre: 0.27137984619140626 
#sdxl_turbo_1_step coco30K clip_socre: 0.2724981628417969 
#lcm coco30k_4_step clip_socre: 
#lcm coco30k_1_step clip_socre: 
count = 0
total_score = 0
for case_number, caption in enumerate(captions_30k):
    # image = instaflow_pipe(prompt=caption, num_inference_steps=1, guidance_scale=0.0).images[0] 
    # image = sd_pipe(prompt=caption, num_inference_steps=25, guidance_scale=0.0).images[0]
    # image = lcm_pipe(prompt=caption, num_inference_steps=1, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images
    image = sdxl_turbo_pipe(prompt=caption, num_inference_steps=4, guidance_scale=0.0).images[0]
    score = get_clip_score(image, caption)
    try:
        image.save("/home/liutao/workspace/data/sdxl_turbo_4_step_coco30K/"+caption+".jpg")
    except:
        print(caption)
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

In [None]:
count = 0
total_score = 0
for case_number, caption in enumerate(captions_30k):
    # image = instaflow_pipe(prompt=caption, num_inference_steps=1, guidance_scale=0.0).images[0] 
    # image = sd_pipe(prompt=caption, num_inference_steps=25, guidance_scale=0.0).images[0]
    # image = lcm_pipe(prompt=caption, num_inference_steps=num_inference_steps, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images
    image = sdxl_turbo_pipe(prompt=caption, num_inference_steps=1, guidance_scale=0.0).images[0]
    score = get_clip_score(image, caption)
    try:
        image.save("/home/liutao/workspace/data/sdxl_turbo_1_step_coco30k/"+caption+".jpg")
    except:
        print(caption)
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

### lcm

In [10]:
count = 0
total_score = 0
for case_number, caption in enumerate(captions_30k):
    image = lcm_pipe(prompt=caption, num_inference_steps=4, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images[0]
    score = get_clip_score(image, caption)
    try:
        image.save("/home/liutao/workspace/data/lcm_4_step_coco30k/"+caption+".jpg")
    except:
        print(caption)
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

current num: 1000 current avg clip score: 0.26306097412109375
A compact home bathroom with toilet, pedestal sink, and tub/shower.
current num: 2000 current avg clip score: 0.26297000122070313
Street signs from the corner of 8th ave. and 22 3/4 st.
current num: 3000 current avg clip score: 0.2630679321289062
This tennis pro finds himself resting,  leaning against the advertising graphics/sign.
current num: 4000 current avg clip score: 0.2627286529541016
A lady is trying to go ice skating/skiing. 
current num: 5000 current avg clip score: 0.26308931884765624
There is a white cow/bull in front of a white building with purple trim.
current num: 6000 current avg clip score: 0.26321371459960935
a red and yellow train is going past some red lights/train signals
A living room with a red/brown area rug, two couches and a large flat screen TV.
current num: 7000 current avg clip score: 0.26300069754464284
current num: 8000 current avg clip score: 0.26325895690917966
current num: 9000 current avg 

In [None]:
## lcm_1_step: 0.21957598673502604
count = 0
total_score = 0
for case_number, caption in enumerate(captions_30k):
    image = lcm_pipe(prompt=caption, num_inference_steps=1, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images[0]
    score = get_clip_score(image, caption)
    try:
        image.save("/home/liutao/workspace/data/lcm_1_step_coco30K/"+caption+".jpg")
    except:
        print(caption)
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

## hpsv2 score

In [4]:
# Get benchmark prompts (<style> = all, anime, concept-art, paintings, photo)
all_prompts = hpsv2.benchmark_prompts('all') 
path = "/home/liutao/workspace/data/sdxl_turbo_1_step_hpsv2"
# Iterate over the benchmark prompts to generate images
# for style, prompts in all_prompts.items():
#     for idx, prompt in enumerate(prompts):
#         # image = instaflow_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0] 
#         # image = sd_pipe(prompt=prompt, num_inference_steps=25, guidance_scale=0.0).images[0]
#         image = sdxl_turbo_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
#         # TextToImageModel is the model you want to evaluate
#         image.save(os.path.join(path, style, f"{idx:05d}.jpg")) 
#         # <image_path> is the folder path to store generated images, as the input of hpsv2.evaluate().
hpsv2.evaluate(path) 

Loading model ...
Loading model successfully!




-----------benchmark score ---------------- 
sdxl_turbo_1_step_hpsv2 anime           28.47 	 0.1117
sdxl_turbo_1_step_hpsv2 concept-art     27.48 	 0.0845
sdxl_turbo_1_step_hpsv2 photo           27.60 	 0.1805
sdxl_turbo_1_step_hpsv2 paintings       27.54 	 0.1317


In [5]:
# Get benchmark prompts (<style> = all, anime, concept-art, paintings, photo)
all_prompts = hpsv2.benchmark_prompts('all') 
path = "/home/liutao/workspace/data/sdxl_turbo_4_step_hpsv2"
# Iterate over the benchmark prompts to generate images
for style, prompts in all_prompts.items():
    for idx, prompt in enumerate(prompts):
        # image = instaflow_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0] 
        # image = sd_pipe(prompt=prompt, num_inference_steps=25, guidance_scale=0.0).images[0]
        image = sdxl_turbo_pipe(prompt=prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
        # TextToImageModel is the model you want to evaluate
        image.save(os.path.join(path, style, f"{idx:05d}.jpg")) 
        # <image_path> is the folder path to store generated images, as the input of hpsv2.evaluate().

hpsv2.evaluate(path) 

Token indices sequence length is longer than the specified maximum sequence length for this model (80 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['yamamoto.']
Token indices sequence length is longer than the specified maximum sequence length for this model (80 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['yamamoto.']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['tooth wu, wlop, beeple, and greg rutkowski.']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['tooth wu, wlop, beeple, and greg rutkowski.']


Loading model ...
Loading model successfully!
-----------benchmark score ---------------- 
sdxl_turbo_4_step_hpsv2 anime           28.67 	 0.0960
sdxl_turbo_4_step_hpsv2 concept-art     27.83 	 0.0870
sdxl_turbo_4_step_hpsv2 photo           27.86 	 0.1791
sdxl_turbo_4_step_hpsv2 paintings       27.96 	 0.1141


In [7]:
# Get benchmark prompts (<style> = all, anime, concept-art, paintings, photo)
all_prompts = hpsv2.benchmark_prompts('all') 
path = "/home/liutao/workspace/data/lcm_1_step_hpsv2"
# Iterate over the benchmark prompts to generate images
for style, prompts in all_prompts.items():
    for idx, prompt in enumerate(prompts):
        # image = instaflow_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0] 
        # image = sd_pipe(prompt=prompt, num_inference_steps=25, guidance_scale=0.0).images[0]
        image = lcm_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images[0]
        # image = sdxl_turbo_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
        # TextToImageModel is the model you want to evaluate
        image.save(os.path.join(path, style, f"{idx:05d}.jpg")) 
        # <image_path> is the folder path to store generated images, as the input of hpsv2.evaluate().

hpsv2.evaluate(path) 

Loading model ...
Loading model successfully!




-----------benchmark score ---------------- 
lcm_1_step_hpsv2 anime           22.69 	 0.1126
lcm_1_step_hpsv2 concept-art     22.79 	 0.1477
lcm_1_step_hpsv2 photo           22.92 	 0.2112
lcm_1_step_hpsv2 paintings       22.95 	 0.1737


In [8]:
# Get benchmark prompts (<style> = all, anime, concept-art, paintings, photo)
all_prompts = hpsv2.benchmark_prompts('all') 
path = "/home/liutao/workspace/data/lcm_4_step_hpsv2"
# Iterate over the benchmark prompts to generate images
for style, prompts in all_prompts.items():
    for idx, prompt in enumerate(prompts):
        # image = instaflow_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0] 
        # image = sd_pipe(prompt=prompt, num_inference_steps=25, guidance_scale=0.0).images[0]
        image = lcm_pipe(prompt=prompt, num_inference_steps=4, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images[0]
        # image = sdxl_turbo_pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
        # TextToImageModel is the model you want to evaluate
        image.save(os.path.join(path, style, f"{idx:05d}.jpg")) 
        # <image_path> is the folder path to store generated images, as the input of hpsv2.evaluate().

hpsv2.evaluate(path) 

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['yamamoto.']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['tooth wu, wlop, beeple, and greg rutkowski.']


Loading model ...
Loading model successfully!
-----------benchmark score ---------------- 
lcm_4_step_hpsv2 anime           26.58 	 0.1396
lcm_4_step_hpsv2 concept-art     26.16 	 0.0640
lcm_4_step_hpsv2 photo           26.24 	 0.2264
lcm_4_step_hpsv2 paintings       26.28 	 0.1478


## fid score