In [None]:
import torch
import json
import clip
from PIL import Image
import os
from diffusers import DiffusionPipeline
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch
from pipeline_rf import RectifiedFlowPipeline
import random
from diffusers import AutoPipelineForText2Image
import json
import generate_swift as gs
from generate import generate_single_image, load_model

# Seed Python's `random` module so the caption sampling further down is repeatable.
# Only `random` is seeded here; torch is not seeded in this cell.
random.seed(2024)
# Device string handed to the model loaders and pipelines in the cells below.
device = "cuda"

### load model

In [4]:
# Load the CLIP ViT-L/14@336px scorer and its matching image preprocessing transform.
# The model is placed straight onto `device` (the notebook-wide setting) instead of a
# separately hard-coded "cuda", so the scorer follows the same configuration as the
# generation pipelines.
clip_model, clip_preprocess = clip.load('ViT-L/14@336px', device=device)

In [None]:
# Load the model components through generate_swift.load_model (presumably the SwiftBrush
# baseline checkpoint, judging by the path — confirm) and set its output folders.
# NOTE(review): the "ours" loader cell assigns the same names (vae, tokenizer,
# text_encoder, unet, alphas). Whichever loader cell ran last decides which weights the
# evaluation cells use, so run only the loader that matches the section being evaluated.
vae, tokenizer, text_encoder, unet, alphas = gs.load_model("/data/", "/data/20231212/SwiftBrush_reproduce_final20231227/checkpoints/vsd_global_step4000.pth")
# Output folder for the HPSv2 benchmark images.
swift_hpsv2_path = "/data/liutao/data/swift_hpsv2"
# Output folder for the images generated during CLIP scoring.
swift_coco_path = "/data/liutao/data/swift_coco30k"

In [5]:
# Load the student model components through generate.load_model, which additionally
# returns a `scheduler` used by generate_single_image in the "ours" cells.
# NOTE(review): the swift loader cell assigns the same names (vae, tokenizer,
# text_encoder, unet, alphas), so the loader executed last wins — confirm before evaluating.
vae, tokenizer, text_encoder, unet, scheduler, alphas = load_model("/data/", "/data/20231212/SwiftBrush_reproduce_se_parallel/checkpoints_20240228/vsd_global_step36000_8nis.pth", device)
# Number of inference steps passed to generate_single_image.
ours_steps = 8
# Output folder for the HPSv2 benchmark images.
ours_hpsv2_path = "/data/liutao/data/ours_36k_8i_hpsv2"
# Output folder for the images generated during CLIP scoring.
ours_coco_path = "/data/liutao/data/ours_36k_8s_coco"

[INFO] loading student unet checkpoint


  torch.utils._pytree._register_pytree_node(


In [None]:
# Stable Diffusion 2.1 baseline in float16 with an Euler discrete scheduler.
model_id = "/data/model/stable-diffusion-2-1"
# Keep this scheduler under its own name: the global `scheduler` comes from load_model()
# and is passed to generate_single_image by the "ours" cells, so overwriting it here
# would silently change what those cells run with.
sd_scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
sd_pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=sd_scheduler, torch_dtype=torch.float16)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=True)

In [None]:
# InstaFlow baseline: load the RectifiedFlowPipeline from a local checkpoint with the
# safety checker disabled.
instaflow_pipe = RectifiedFlowPipeline.from_pretrained("/data/model/instaflow_0_9B_from_sd_1_5", torch_dtype=torch.float32, safety_checker=None, requires_safety_checker=False) 
### torch.float32 is used here for higher quality

instaflow_pipe.to(device)  ### if no GPU is available, comment out this line
# Silence the per-call progress bar; the evaluation loops call the pipeline once per caption.
instaflow_pipe.set_progress_bar_config(disable=True)

In [None]:
# SDXL-Turbo baseline: load the fp16 variant of the local checkpoint in float16.
sdxl_turbo_pipe = AutoPipelineForText2Image.from_pretrained("/data/model/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
sdxl_turbo_pipe.to(device)
# Silence the per-call progress bar; the evaluation loops call the pipeline once per caption.
sdxl_turbo_pipe.set_progress_bar_config(disable=True)

In [None]:
# LCM (Dreamshaper v7) baseline, loaded from a local checkpoint with the safety checker disabled.
lcm_pipe = DiffusionPipeline.from_pretrained("/data/model/LCM_Dreamshaper_v7", safety_checker=None, requires_safety_checker=False)
# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
lcm_pipe.to(torch_device=device, torch_dtype=torch.float32)
# Silence the per-call progress bar; the evaluation loops call the pipeline once per caption.
lcm_pipe.set_progress_bar_config(disable=True)

## clip score

In [7]:
def get_clip_score(image, text):
    """Return the CLIP cosine similarity between one image and one text prompt.

    Uses the notebook-level ``clip_model`` / ``clip_preprocess`` pair.

    Args:
        image: Image accepted by ``clip_preprocess`` (the callers in this notebook
            pass PIL images).
        text: Prompt string; it is tokenized with ``truncate=True`` so over-long
            prompts are cut instead of raising.

    Returns:
        float: Dot product of the L2-normalised image and text embeddings.
    """
    # Follow the device the model actually lives on rather than re-deriving one from
    # torch.cuda.is_available(); this keeps inputs and weights on the same device
    # however the model was loaded.
    model_device = next(clip_model.parameters()).device

    image_input = clip_preprocess(image).unsqueeze(0).to(model_device)
    text_input = clip.tokenize([text], truncate=True).to(model_device)

    # Inference only: no gradients are needed for scoring.
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_input)

    # Normalise so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Batch size is 1 on both sides, so the product is a single value.
    clip_score = torch.matmul(image_features, text_features.T).item()

    return clip_score

### image with prompt test

In [None]:
# Re-score images that are already on disk. Each file name (without its extension)
# is used as the prompt the image is scored against.
folder_path = "/home/liutao/workspace/data/ours_coco30k_test"
image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

total_score = 0
count = 0

# Sorted so the intermediate progress lines are the same from run to run
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(image_extensions):
        continue
    text = os.path.splitext(filename)[0]
    # The context manager closes the file handle as soon as the image is scored,
    # so a large folder does not accumulate open files.
    with Image.open(os.path.join(folder_path, filename)) as image:
        score = get_clip_score(image, text)
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:", count,f"current AVG CLIP Score: {total_score/count}")

# Guard against an empty or image-free folder instead of dividing by zero.
if count:
    print(f"AVG CLIP Score: {total_score/count}")
else:
    print(f"No images found in {folder_path}")


### load coco30k_caption

In [8]:
# Collect every caption from the COCO 2014 validation annotations.
# `with` guarantees the file is closed even if parsing fails.
with open('/data/dataset/coco2014-val/annotations/captions_val2014.json') as coco_f:
    coco_annotations = json.load(coco_f)
captions = [annotation['caption'] for annotation in coco_annotations['annotations']]

# Re-seed right before sampling so the drawn subset does not depend on earlier cells.
random.seed(2024)
# NOTE: random.choices draws WITH replacement, so captions_30k may contain duplicates.
captions_30k = random.choices(captions, k=30000)
print(len(captions_30k),captions[0],captions[1])

30000 A bicycle replica with a clock as the front wheel. A black Honda motorcycle parked in front of a garage.


### load valid_caption

In [None]:
# Path to the validation prompt file.
jsonl_file_path = '/data/20231212/SwiftBrush_reproduce_se_parallel/JourneyDB/valid/valid_prompt.jsonl'

with open(jsonl_file_path) as f:
    raw_text = f.read()

try:
    # Whole file is a single JSON document (e.g. one array of prompts).
    prompts_list = list(json.loads(raw_text))
except json.JSONDecodeError:
    # True JSON Lines layout: one JSON document per non-empty line.
    # NOTE(review): each entry is kept exactly as parsed; if the lines are objects
    # rather than plain strings, extract the prompt field before sampling — confirm
    # against the file.
    prompts_list = [json.loads(line) for line in raw_text.splitlines() if line.strip()]

# Re-seed right before sampling so the drawn subset does not depend on earlier cells.
random.seed(2024)
# NOTE: this rebinds captions_30k, the same name the COCO caption cell assigns; the
# evaluation loops use whichever of the two cells ran last.
# random.choices draws WITH replacement, so duplicates are possible.
captions_30k = random.choices(prompts_list, k=30000)
print(len(captions_30k),captions_30k[0],captions_30k[1])

### ours

In [9]:
# Generate one image per caption with the student model, save it under an
# index-based file name, and track the running mean CLIP score.
# Create the output folder up front; otherwise every save below would fail.
os.makedirs(ours_coco_path, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    image = generate_single_image(network=(vae, tokenizer, text_encoder, unet, scheduler),prompt=caption,seed=2024,num_inference_steps=ours_steps)
    score = get_clip_score(image, caption)
    try:
        image.save(os.path.join(ours_coco_path, f"{count}.jpg"))
    except Exception as err:
        # Saving is best-effort and the score is still counted. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop this long-running loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

current num: 1000 current avg clip score: 0.2615880126953125
current num: 2000 current avg clip score: 0.2610256958007813
current num: 3000 current avg clip score: 0.260946533203125
current num: 4000 current avg clip score: 0.26053231811523436
current num: 5000 current avg clip score: 0.2607731689453125
current num: 6000 current avg clip score: 0.2605646769205729
current num: 7000 current avg clip score: 0.26062793840680804
current num: 8000 current avg clip score: 0.26105057525634767
current num: 9000 current avg clip score: 0.2608954467773438
current num: 10000 current avg clip score: 0.2608013977050781
current num: 11000 current avg clip score: 0.2607430586381392
current num: 12000 current avg clip score: 0.2606460622151693
current num: 13000 current avg clip score: 0.2606663771409255
current num: 14000 current avg clip score: 0.26059676252092634
current num: 15000 current avg clip score: 0.26063055419921877
current num: 16000 current avg clip score: 0.26057498550415037
current num:

### swift

In [None]:
# Generate one image per caption through generate_swift, save it under an
# index-based file name, and track the running mean CLIP score.
# Create the output folder up front; otherwise every save below would fail.
os.makedirs(swift_coco_path, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    imgs = gs.prompt_to_img_student((vae, tokenizer, text_encoder, unet), caption, seed=2024, alphas=alphas)
    # Channels-last numpy array, then scale to 8-bit.
    # NOTE(review): assumes the returned values lie in [0, 1] — confirm; anything
    # outside that range would wrap around in the uint8 cast.
    imgs = imgs.permute(0, 2, 3, 1).cpu().numpy()
    imgs = (imgs * 255).round().astype('uint8')
    imgs = gs.image_grid(imgs, grid_size=(-1, 1))
    imgs = Image.fromarray(imgs)
    score = get_clip_score(imgs, caption)
    try:
        imgs.save(os.path.join(swift_coco_path, f"{count}.jpg"))
    except Exception as err:
        # Saving is best-effort and the score is still counted. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop this long-running loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

### instaflow

In [None]:
# InstaFlow, 1 step: generate one image per caption, save it under the caption text,
# and track the running mean CLIP score.
out_dir = "/home/liutao/workspace/data/instaflow_coco30k/"
# Create the output folder up front; otherwise every save below would fail.
os.makedirs(out_dir, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    image = instaflow_pipe(prompt=caption, num_inference_steps=1, guidance_scale=0.0).images[0]
    score = get_clip_score(image, caption)
    try:
        # The caption is the file name, so captions with path separators or of
        # excessive length cannot be written; those are reported and skipped.
        image.save(out_dir+caption+".jpg")
    except Exception as err:
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

### sdxl_turbo

In [None]:
# SDXL-Turbo, 4 steps: generate one image per caption, save it under the caption text,
# and track the running mean CLIP score.
out_dir = "/home/liutao/workspace/data/sdxl4_coco30k_test/"
# Create the output folder up front; otherwise every save below would fail.
os.makedirs(out_dir, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    image = sdxl_turbo_pipe(prompt=caption, num_inference_steps=4, guidance_scale=0.0).images[0]
    score = get_clip_score(image, caption)
    try:
        # The caption is the file name, so captions with path separators or of
        # excessive length cannot be written; those are reported and skipped.
        image.save(out_dir+caption+".jpg")
    except Exception as err:
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

In [None]:
# SDXL-Turbo, 1 step: generate one image per caption, save it under the caption text,
# and track the running mean CLIP score.
out_dir = "/home/liutao/workspace/data/sdxl1_coco30k_test/"
# Create the output folder up front; otherwise every save below would fail.
os.makedirs(out_dir, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    image = sdxl_turbo_pipe(prompt=caption, num_inference_steps=1, guidance_scale=0.0).images[0]
    score = get_clip_score(image, caption)
    try:
        # The caption is the file name, so captions with path separators or of
        # excessive length cannot be written; those are reported and skipped.
        image.save(out_dir+caption+".jpg")
    except Exception as err:
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

### lcm

In [None]:
import hashlib


def text_to_hash(text):
    """Return a hex SHA-256 digest of `text`, usable as a filesystem-safe file name."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


# LCM, 4 steps at 512x512: generate one image per caption, save it under a hash of the
# caption, and track the running mean CLIP score.
# text_to_hash is defined in this cell because no earlier cell defines it; previously
# the resulting NameError was swallowed by the bare except and no image was ever saved.
out_dir = "/home/liutao/workspace/data/prompt_test_lcm4/"
os.makedirs(out_dir, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    # NOTE(review): truncation=True is forwarded to the pipeline call as-is — confirm
    # the loaded LCM pipeline accepts this keyword.
    image = lcm_pipe(prompt=caption, height=512, width=512, num_inference_steps=4, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil", truncation=True).images[0]
    score = get_clip_score(image, caption)
    try:
        image.save(out_dir+text_to_hash(caption)+".jpg")
    except Exception as err:
        # Saving is best-effort and the score is still counted. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 100 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

In [None]:
# LCM, 1 step at the pipeline's default resolution: images are scored but not saved
# in this cell; only the running mean CLIP score is reported.
total_score = 0
count = 0
for case_number, caption in enumerate(captions_30k):
    image = lcm_pipe(
        prompt=caption,
        num_inference_steps=1,
        guidance_scale=8.0,
        lcm_origin_steps=50,
        output_type="pil",
    ).images[0]
    score = get_clip_score(image, caption)
    total_score = total_score + score
    # Number of captions processed so far (enumerate is zero-based).
    count = case_number + 1
    if not count % 1000:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

In [None]:
# LCM, 4 steps at 512x512: generate one image per caption, save it under the caption
# text, and track the running mean CLIP score.
out_dir = "/home/liutao/workspace/data/lcm_4_step_512_coco30K/"
# Create the output folder up front; otherwise every save below would fail.
os.makedirs(out_dir, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    image = lcm_pipe(prompt=caption, height=512, width=512, num_inference_steps=4, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images[0]
    score = get_clip_score(image, caption)
    try:
        # The caption is the file name, so captions with path separators or of
        # excessive length cannot be written; those are reported and skipped.
        image.save(out_dir+caption+".jpg")
    except Exception as err:
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

In [None]:
# LCM, 1 step at 512x512: generate one image per caption, save it under the caption
# text, and track the running mean CLIP score.
out_dir = "/home/liutao/workspace/data/lcm_1_step_512_coco30K/"
# Create the output folder up front; otherwise every save below would fail.
os.makedirs(out_dir, exist_ok=True)

count = 0
total_score = 0
for caption in captions_30k:
    image = lcm_pipe(prompt=caption, height=512, width=512, num_inference_steps=1, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images[0]
    score = get_clip_score(image, caption)
    try:
        # The caption is the file name, so captions with path separators or of
        # excessive length cannot be written; those are reported and skipped.
        image.save(out_dir+caption+".jpg")
    except Exception as err:
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"failed to save image for caption {caption!r}: {err}")
    total_score += score
    count += 1
    if count % 1000 == 0:
        print("current num:",count,"current avg clip score:",total_score/count)
print(f"AVG CLIP Score: {total_score/count}")

## hpsv2 score

### ours

In [None]:
import hpsv2

# Benchmark prompts grouped by style ('all' = anime, concept-art, paintings, photo).
all_prompts = hpsv2.benchmark_prompts('all')

# Generate one image per benchmark prompt with the student model and store it as
# <ours_hpsv2_path>/<style>/<index>.jpg.
for style, prompts in all_prompts.items():
    # `ours_hpsv2_path` is the folder configured alongside the checkpoint; the name
    # `ours_path` used here before is not defined anywhere in the notebook.
    style_dir = os.path.join(ours_hpsv2_path, style)
    # image.save does not create missing folders.
    os.makedirs(style_dir, exist_ok=True)
    for idx, prompt in enumerate(prompts):
        image = generate_single_image(network=(vae, tokenizer, text_encoder, unet, scheduler),prompt=prompt,seed=2024,num_inference_steps=ours_steps)
        image.save(os.path.join(style_dir, f"{idx:05d}.jpg"))

In [None]:
# Score the benchmark images under ours_hpsv2_path, the HPSv2 output folder defined
# with the "ours" checkpoint (`ours_path` is not defined anywhere in the notebook).
hpsv2.evaluate(ours_hpsv2_path)

### swift

In [None]:
import hpsv2

# Benchmark prompts grouped by style.
all_prompts = hpsv2.benchmark_prompts('all')

# Generate one image per benchmark prompt through generate_swift and store it as
# <swift_hpsv2_path>/<style>/<index>.jpg.
for style, prompts in all_prompts.items():
    style_dir = os.path.join(swift_hpsv2_path, style)
    # Image.save does not create missing folders.
    os.makedirs(style_dir, exist_ok=True)
    for idx, prompt in enumerate(prompts):
        imgs = gs.prompt_to_img_student((vae, tokenizer, text_encoder, unet), prompt, seed=2024, alphas=alphas)
        # Channels-last numpy array, then scale to 8-bit.
        # NOTE(review): assumes the returned values lie in [0, 1] — confirm.
        imgs = imgs.permute(0, 2, 3, 1).cpu().numpy()
        imgs = (imgs * 255).round().astype('uint8')
        imgs = gs.image_grid(imgs, grid_size=(-1, 1))
        imgs = Image.fromarray(imgs)
        imgs.save(os.path.join(style_dir, f"{idx:05d}.jpg"))

In [None]:
# Run the HPSv2 evaluation on the images stored under swift_hpsv2_path.
hpsv2.evaluate(swift_hpsv2_path) 

### sdxl_4

In [None]:
# Imported here so this cell does not depend on an earlier hpsv2 cell having been run.
import hpsv2

# Benchmark prompts grouped by style ('all' = anime, concept-art, paintings, photo).
all_prompts = hpsv2.benchmark_prompts('all')
path = "/home/liutao/workspace/data/sdxl4_hpsv2"

# Generate one image per benchmark prompt with SDXL-Turbo (4 steps) and store it as
# <path>/<style>/<index>.jpg.
for style, prompts in all_prompts.items():
    style_dir = os.path.join(path, style)
    # image.save does not create missing folders.
    os.makedirs(style_dir, exist_ok=True)
    for idx, prompt in enumerate(prompts):
        image = sdxl_turbo_pipe(prompt=prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
        image.save(os.path.join(style_dir, f"{idx:05d}.jpg"))

hpsv2.evaluate(path)

### lcm4

In [None]:
# Imported here so this cell does not depend on an earlier hpsv2 cell having been run.
import hpsv2

# Benchmark prompts grouped by style ('all' = anime, concept-art, paintings, photo).
all_prompts = hpsv2.benchmark_prompts('all')
path = "/home/liutao/workspace/data/lcm4_hpsv2"

# Generate one image per benchmark prompt with LCM (4 steps, 512x512) and store it as
# <path>/<style>/<index>.jpg.
for style, prompts in all_prompts.items():
    style_dir = os.path.join(path, style)
    # image.save does not create missing folders.
    os.makedirs(style_dir, exist_ok=True)
    for idx, prompt in enumerate(prompts):
        image = lcm_pipe(prompt=prompt, width=512, height=512, num_inference_steps=4, guidance_scale=8.0, lcm_origin_steps=50, output_type="pil").images[0]
        image.save(os.path.join(style_dir, f"{idx:05d}.jpg"))

hpsv2.evaluate(path)

## fid score