## Clip score eval

In [1]:
from torchmetrics.functional.multimodal import clip_score
from functools import partial
import torch
### dont move to another cell
clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16")

def calculate_clip_score(images, prompts):
    images_int = (images * 255).astype("uint8")
    clip_score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach()
    return round(float(clip_score), 4)

In [2]:
from datasets import load_dataset

dataset = load_dataset("HuggingFaceM4/COCO", split='test[0:100]')

tokens = dataset['sentences'][:100]

prompts = []
for token in tokens:
   prompts.append(token['raw'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
import diffusers
diffusers.logging.set_verbosity_warning()

diffusers.logging.disable_progress_bar()
import datasets
datasets.disable_progress_bar()

# Compare Different Models

In [14]:
from time import time
prompt = prompts[1]

In [8]:
print(prompt)

Man riding a motor bike on a dirt road on the countryside.


### LDM

In [28]:
from diffusers import DiffusionPipeline
model_id = "CompVis/ldm-text2im-large-256"

# load model and scheduler
ldm = DiffusionPipeline.from_pretrained(model_id)

cur = time()
ldm_image = ldm([prompt], num_inference_steps=75, eta=0.3, guidance_scale=8).images
print("Generating time in seconds: ", time()-cur)

vqvae/diffusion_pytorch_model.safetensors not found
The config attributes {'timestep_values': None} were passed to DDIMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.


  0%|          | 0/75 [00:00<?, ?it/s]

Generating time in seconds:  371.7733271121979


### LCM

In [20]:
from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler
import torch

unet = UNet2DConditionModel.from_pretrained("latent-consistency/lcm-sdxl", torch_dtype=torch.float16, variant="fp16")
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16")

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
cur = time()
lcm_image = pipe(prompt=prompt, num_inference_steps=45, guidance_scale=8.0 ).images
print("Generating time in seconds: ", time()-cur)

The config attributes {'skip_prk_steps': True} were passed to LCMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.


  0%|          | 0/45 [00:00<?, ?it/s]

Generating time in seconds:  7.78581166267395


### LCM-LoRA

In [38]:
import torch
from diffusers import LCMScheduler, AutoPipelineForText2Image

model_id = "stabilityai/stable-diffusion-xl-base-1.0"
adapter_id = "latent-consistency/lcm-lora-sdxl"

sdxlpipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16")
sdxlpipe.scheduler = LCMScheduler.from_config(sdxlpipe.scheduler.config)
sdxlpipe.to("cuda")

# load and fuse lcm lora
sdxlpipe.load_lora_weights(adapter_id)
sdxlpipe.fuse_lora()
cur = time()
lora_image = sdxlpipe(prompt=prompt, num_inference_steps=45, guidance_scale=8.0).images
print("Generating time in seconds: ", time()-cur)

The config attributes {'skip_prk_steps': True} were passed to LCMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.


  0%|          | 0/45 [00:00<?, ?it/s]

Generating time in seconds:  13.801863431930542


### LDM-DPO

In [16]:
import torch
from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler
from diffusers.utils import make_image_grid

sdxl_dpo_lora_pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)

sdxl_dpo_lora_pipe.safety_checker = None
sdxl_dpo_lora_pipe.set_progress_bar_config(disable=True)


sdxl_dpo_lora_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    sdxl_dpo_lora_pipe.scheduler.config,
    use_karras_sigmas=True,
    algorithm_type="sde-dpmsolver++"
)

sdxl_dpo_lora_pipe.to("cuda");

seed = 12341234123
negative_prompt = "3d render, cartoon, drawing, art, low light, blur, pixelated, low resolution, black and white"
num_inference_steps = 40 # Keep
height = 1024
width = height

sdxl_dpo_lora_pipe.unload_lora_weights()
sdxl_dpo_lora_pipe.load_lora_weights(
    "radames/sdxl-DPO-LoRA",
    adapter_name="sdxl-dpo-lora",
)
# sdxl_dpo_lora_pipe.set_adapters(["sdxl-dpo-lora"], adapter_weights=[0.9])
generator = torch.Generator().manual_seed(seed)
cur = time()
dpo_image = sdxl_dpo_lora_pipe(disable_tqdm = True,prompt=prompt,num_images_per_prompt=1, num_inference_steps=45, guidance_scale=8.0).images
print("Generating time in seconds: ", time()-cur)

You are using `unload_lora_weights` to disable and unload lora weights. If you want to iteratively enable and disable adapter weights,you can use `pipe.enable_lora()` or `pipe.disable_lora()`. After installing the latest version of PEFT.


Generating time in seconds:  15.30357027053833


### Evaluation

In [29]:
# works with (, output_type='np')
import numpy as np
v14Image = ldm_image
clip_score = calculate_clip_score(np.array(v14Image), prompt)
print("CLIP Score: ", clip_score)

CLIP Score:  25.3381


In [30]:
from PIL import Image
import io
import os
import json

from warnings import filterwarnings


# os.environ["CUDA_VISIBLE_DEVICES"] = "0"    # choose GPU if you are on a multi GPU server
import numpy as np
import torch
import pytorch_lightning as pl
import torch.nn as nn
from torchvision import datasets, transforms
import tqdm

from os.path import join
from datasets import load_dataset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import json

import clip


from PIL import Image, ImageFile


#####  This script will predict the aesthetic score for this image file:

img_path = "test.jpg"





# if you changed the MLP architecture during training, change it also here:
class MLP(pl.LightningModule):
    def __init__(self, input_size, xcol='emb', ycol='avg_rating'):
        super().__init__()
        self.input_size = input_size
        self.xcol = xcol
        self.ycol = ycol
        self.layers = nn.Sequential(
            nn.Linear(self.input_size, 1024),
            #nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            #nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            #nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(64, 16),
            #nn.ReLU(),

            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.layers(x)

    def training_step(self, batch, batch_idx):
            x = batch[self.xcol]
            y = batch[self.ycol].reshape(-1, 1)
            x_hat = self.layers(x)
            loss = F.mse_loss(x_hat, y)
            return loss

    def validation_step(self, batch, batch_idx):
        x = batch[self.xcol]
        y = batch[self.ycol].reshape(-1, 1)
        x_hat = self.layers(x)
        loss = F.mse_loss(x_hat, y)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

def normalized(a, axis=-1, order=2):
    import numpy as np  # pylint: disable=import-outside-toplevel

    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)


In [31]:
aestheticPredictorModel = MLP(768)  # CLIP embedding dim is 768 for CLIP ViT L 14

s = torch.load("sac+logos+ava1-l14-linearMSE.pth")   # load the model you trained previously or the model available in this repo

aestheticPredictorModel.load_state_dict(s)

aestheticPredictorModel.to("cuda")
aestheticPredictorModel.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model2, preprocess = clip.load("ViT-L/14", device=device)  #RN50x64

In [32]:
# works with output type not 'np'
import ImageReward as reward
import clip
from PIL import Image
imageReward = reward.load("ImageReward-v1.0")
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(v14Image[0]).unsqueeze(0).to(device)
text = clip.tokenize(prompt).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    hps = image_features @ text_features.T
    print("HPS: ",hps)

    v14_image_reward = imageReward.score(prompt, v14Image)
    image_features = model2.encode_image(image)

print("image reward: ", v14_image_reward)
im_emb_arr = normalized(image_features.cpu().detach().numpy() )
v14_aesthetic_score = aestheticPredictorModel(torch.from_numpy(im_emb_arr).to(device).type(torch.cuda.FloatTensor))
print("aesthetic score: ", v14_aesthetic_score)

load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
HPS:  tensor([[0.3140]], device='cuda:0', dtype=torch.float16)
image reward:  1.1446330547332764
aesthetic score:  tensor([[5.2901]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [33]:
v14Image[0].save("ldm_75.png")

## Latent Diffusion Model

https://huggingface.co/CompVis/ldm-text2im-large-256

In [4]:
from diffusers import DiffusionPipeline
model_id = "CompVis/ldm-text2im-large-256"

# load model and scheduler
ldm = DiffusionPipeline.from_pretrained(model_id)

# # run pipeline in inference (sample random noise and denoise)
# prompt = "A woman wearing a net on her head cutting a cake"
# images_ldm = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images

# # save images
# for idx, image in enumerate(images_ldm):
#     image.save(f"ldm-{idx}.png")


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
bert/model.safetensors not found
The config attributes {'timestep_values': None} were passed to DDIMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.


In [5]:
total_clip_score = 0
for prompt in prompts:
  num_inference_steps = 75

  # image = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, output_type='np' ).images
  image = ldm([prompt], num_inference_steps=num_inference_steps, eta=0.3, guidance_scale=6, output_type='np').images

  img_clip_score = calculate_clip_score(image, prompt)
  total_clip_score += img_clip_score
  print(img_clip_score)

# clip score:
total_clip_score = total_clip_score/100
print(total_clip_score)

  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Consistency Model

https://huggingface.co/dg845/consistency-model-pipelines

In [4]:
import torch

from diffusers import ConsistencyModelPipeline

device = "cuda"
# Load the cd_imagenet64_l2 checkpoint.
model_id_or_path = "dg845/consistency-model-pipelines"
pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
pipe.to(device)

# Onestep Sampling
image = pipe(num_inference_steps=1).images[0]

  torch.utils._pytree._register_pytree_node(
  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
total_clip_score = 0
for prompt in prompts:
  num_inference_steps = 45

  image = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, output_type='np' ).images
  # image = ldm([prompt], num_inference_steps=num_inference_steps, eta=0.3, guidance_scale=6, output_type='np').images

  img_clip_score = calculate_clip_score(image, prompt)
  total_clip_score += img_clip_score
  print(img_clip_score)

# clip score:
total_clip_score = total_clip_score/100
print(total_clip_score)

TypeError: __call__() got an unexpected keyword argument 'prompt'

In [None]:
import torch
import clip
from PIL import Image

num_inference_steps = 45
total_hps = 0

for i in range(len(prompts)):
    prompt = prompts[i]
    
    v14Image = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    text = clip.tokenize(prompt).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    
        hps = image_features @ text_features.T
        print(hps)
    
    total_hps += hps

print(total_hps/100)

## Latent Consistency Model

In [None]:
# !git clone https://github.com/huggingface/transformers.git
# %cd transformers
# !pip install -e .

https://huggingface.co/latent-consistency/lcm-sdxl

In [4]:
from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler
import torch

unet = UNet2DConditionModel.from_pretrained("latent-consistency/lcm-sdxl", torch_dtype=torch.float16, variant="fp16")
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16")

pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

# prompt = "a close-up picture of an old man standing in the rain"

# image = pipe(prompt, num_inference_steps=4, guidance_scale=8.0).images[0]

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
The config attributes {'skip_prk_steps': True} were passed to LCMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.


StableDiffusionXLPipeline {
  "_class_name": "StableDiffusionXLPipeline",
  "_diffusers_version": "0.26.3",
  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
  "feature_extractor": [
    null,
    null
  ],
  "force_zeros_for_empty_prompt": true,
  "image_encoder": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "LCMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "text_encoder_2": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "tokenizer_2": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [6]:
total_clip_score = 0
for prompt in prompts:
  num_inference_steps = 45

  image = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, output_type='np' ).images
  # image = ldm([prompt], num_inference_steps=num_inference_steps, eta=0.3, guidance_scale=6, output_type='np').images

  img_clip_score = calculate_clip_score(image, prompt)
  total_clip_score += img_clip_score
  print(img_clip_score)

# clip score:
total_clip_score = total_clip_score/100
print(total_clip_score)

  0%|          | 0/45 [00:00<?, ?it/s]

31.9215


  0%|          | 0/45 [00:00<?, ?it/s]

33.19


  0%|          | 0/45 [00:00<?, ?it/s]

27.6881


  0%|          | 0/45 [00:00<?, ?it/s]

30.7924


  0%|          | 0/45 [00:00<?, ?it/s]

33.7725


  0%|          | 0/45 [00:00<?, ?it/s]

32.8635


  0%|          | 0/45 [00:00<?, ?it/s]

31.7462


  0%|          | 0/45 [00:00<?, ?it/s]

31.5683


  0%|          | 0/45 [00:00<?, ?it/s]

35.084


  0%|          | 0/45 [00:00<?, ?it/s]

34.18


  0%|          | 0/45 [00:00<?, ?it/s]

35.1797


  0%|          | 0/45 [00:00<?, ?it/s]

31.5178


  0%|          | 0/45 [00:00<?, ?it/s]

33.303


  0%|          | 0/45 [00:00<?, ?it/s]

32.6073


  0%|          | 0/45 [00:00<?, ?it/s]

34.9461


  0%|          | 0/45 [00:00<?, ?it/s]

31.0084


  0%|          | 0/45 [00:00<?, ?it/s]

30.9671


  0%|          | 0/45 [00:00<?, ?it/s]

31.2763


  0%|          | 0/45 [00:00<?, ?it/s]

29.6859


  0%|          | 0/45 [00:00<?, ?it/s]

26.3598


  0%|          | 0/45 [00:00<?, ?it/s]

33.8572


  0%|          | 0/45 [00:00<?, ?it/s]

29.9998


  0%|          | 0/45 [00:00<?, ?it/s]

30.1192


  0%|          | 0/45 [00:00<?, ?it/s]

31.3145


  0%|          | 0/45 [00:00<?, ?it/s]

34.5031


  0%|          | 0/45 [00:00<?, ?it/s]

33.5505


  0%|          | 0/45 [00:00<?, ?it/s]

24.1632


  0%|          | 0/45 [00:00<?, ?it/s]

32.5233


  0%|          | 0/45 [00:00<?, ?it/s]

30.9015


  0%|          | 0/45 [00:00<?, ?it/s]

32.5124


  0%|          | 0/45 [00:00<?, ?it/s]

29.7767


  0%|          | 0/45 [00:00<?, ?it/s]

29.0806


  0%|          | 0/45 [00:00<?, ?it/s]

28.2494


  0%|          | 0/45 [00:00<?, ?it/s]

28.5056


  0%|          | 0/45 [00:00<?, ?it/s]

27.293


  0%|          | 0/45 [00:00<?, ?it/s]

31.0074


  0%|          | 0/45 [00:00<?, ?it/s]

33.706


  0%|          | 0/45 [00:00<?, ?it/s]

30.5113


  0%|          | 0/45 [00:00<?, ?it/s]

31.1367


  0%|          | 0/45 [00:00<?, ?it/s]

29.6724


  0%|          | 0/45 [00:00<?, ?it/s]

30.7422


  0%|          | 0/45 [00:00<?, ?it/s]

32.4404


  0%|          | 0/45 [00:00<?, ?it/s]

34.7519


  0%|          | 0/45 [00:00<?, ?it/s]

29.7968


  0%|          | 0/45 [00:00<?, ?it/s]

30.094


  0%|          | 0/45 [00:00<?, ?it/s]

26.2508


  0%|          | 0/45 [00:00<?, ?it/s]

30.2737


  0%|          | 0/45 [00:00<?, ?it/s]

29.7505


  0%|          | 0/45 [00:00<?, ?it/s]

29.3609


  0%|          | 0/45 [00:00<?, ?it/s]

29.6801


  0%|          | 0/45 [00:00<?, ?it/s]

28.6414


  0%|          | 0/45 [00:00<?, ?it/s]

31.864


  0%|          | 0/45 [00:00<?, ?it/s]

29.065


  0%|          | 0/45 [00:00<?, ?it/s]

30.7703


  0%|          | 0/45 [00:00<?, ?it/s]

30.7466


  0%|          | 0/45 [00:00<?, ?it/s]

28.3079


  0%|          | 0/45 [00:00<?, ?it/s]

26.8763


  0%|          | 0/45 [00:00<?, ?it/s]

27.2683


  0%|          | 0/45 [00:00<?, ?it/s]

25.7732


  0%|          | 0/45 [00:00<?, ?it/s]

27.7829


  0%|          | 0/45 [00:00<?, ?it/s]

28.005


  0%|          | 0/45 [00:00<?, ?it/s]

29.127


  0%|          | 0/45 [00:00<?, ?it/s]

25.754


  0%|          | 0/45 [00:00<?, ?it/s]

28.9783


  0%|          | 0/45 [00:00<?, ?it/s]

30.7962


  0%|          | 0/45 [00:00<?, ?it/s]

31.8782


  0%|          | 0/45 [00:00<?, ?it/s]

27.3048


  0%|          | 0/45 [00:00<?, ?it/s]

29.3329


  0%|          | 0/45 [00:00<?, ?it/s]

34.3961


  0%|          | 0/45 [00:00<?, ?it/s]

31.2864


  0%|          | 0/45 [00:00<?, ?it/s]

29.9814


  0%|          | 0/45 [00:00<?, ?it/s]

26.5694


  0%|          | 0/45 [00:00<?, ?it/s]

28.4127


  0%|          | 0/45 [00:00<?, ?it/s]

31.1059


  0%|          | 0/45 [00:00<?, ?it/s]

30.3824


  0%|          | 0/45 [00:00<?, ?it/s]

33.688


  0%|          | 0/45 [00:00<?, ?it/s]

32.7816


  0%|          | 0/45 [00:00<?, ?it/s]

35.2279


  0%|          | 0/45 [00:00<?, ?it/s]

34.953


  0%|          | 0/45 [00:00<?, ?it/s]

33.1133


  0%|          | 0/45 [00:00<?, ?it/s]

34.0347


  0%|          | 0/45 [00:00<?, ?it/s]

29.5631


  0%|          | 0/45 [00:00<?, ?it/s]

35.2932


  0%|          | 0/45 [00:00<?, ?it/s]

33.3295


  0%|          | 0/45 [00:00<?, ?it/s]

32.7473


  0%|          | 0/45 [00:00<?, ?it/s]

32.2369


  0%|          | 0/45 [00:00<?, ?it/s]

29.4413


  0%|          | 0/45 [00:00<?, ?it/s]

36.528


  0%|          | 0/45 [00:00<?, ?it/s]

33.4031


  0%|          | 0/45 [00:00<?, ?it/s]

39.4351


  0%|          | 0/45 [00:00<?, ?it/s]

26.5637


  0%|          | 0/45 [00:00<?, ?it/s]

29.1427


  0%|          | 0/45 [00:00<?, ?it/s]

25.0895


  0%|          | 0/45 [00:00<?, ?it/s]

26.9789


  0%|          | 0/45 [00:00<?, ?it/s]

28.0774


  0%|          | 0/45 [00:00<?, ?it/s]

31.5245


  0%|          | 0/45 [00:00<?, ?it/s]

30.5553


  0%|          | 0/45 [00:00<?, ?it/s]

33.5582


  0%|          | 0/45 [00:00<?, ?it/s]

27.1192


  0%|          | 0/45 [00:00<?, ?it/s]

29.7487
30.797236999999985


In [63]:
import torch
import clip
from PIL import Image

num_inference_steps = 45
total_hps = 0

for i in range(len(prompts)):
    prompt = prompts[i]
    
    v14Image = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    text = clip.tokenize(prompt).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    
        hps = image_features @ text_features.T
        print(hps)
    
    total_hps += hps

print(total_hps/100)

  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3633]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3296]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2891]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3206]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3557]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3481]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3528]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3479]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3555]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3113]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3394]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3435]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3511]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2639]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3682]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2837]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3213]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3293]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2974]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3135]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3218]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2820]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2976]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2966]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3279]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3174]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2505]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2922]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3198]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2891]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2654]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3027]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2827]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3020]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2737]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3157]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3362]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2937]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3093]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2949]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2900]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3025]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3179]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2964]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2891]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2898]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2993]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2974]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2498]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2964]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3213]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3071]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3057]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3220]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3115]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2737]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2747]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2837]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2954]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2668]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3516]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3198]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3052]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2959]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3108]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3198]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2876]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3281]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3181]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2837]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3274]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2798]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2769]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2937]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2944]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3472]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3479]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3354]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3367]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3167]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3096]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2856]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3662]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3640]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3264]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3511]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3606]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2788]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3108]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3784]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2856]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3223]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2681]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2937]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2656]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3206]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3350]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2627]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2871]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2927]], device='cuda:0', dtype=torch.float16)
tensor([[0.3096]], device='cuda:0', dtype=torch.float16)


In [7]:

num_inference_steps = 45
total_image_reward = 0
total_aesthetic_score = 0


import torch
import ImageReward as reward

pipe.safety_checker = None

for i in range(len(prompts)):
    prompt = prompts[i]
    
    
    v14Image = pipe(disable_tqdm = True,prompt=prompt,num_images_per_prompt=1, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    pipe.set_progress_bar_config(disable=True)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    imageReward = reward.load("ImageReward-v1.0")
    with torch.no_grad():
        v14_image_reward = imageReward.score(prompt, v14Image)
        image_features = model2.encode_image(image)
    
    total_image_reward += v14_image_reward
    print("image reward: ", v14_image_reward)
    im_emb_arr = normalized(image_features.cpu().detach().numpy() )
    v14_aesthetic_score = aestheticPredictorModel(torch.from_numpy(im_emb_arr).to(device).type(torch.cuda.FloatTensor))
    total_aesthetic_score += v14_aesthetic_score
    print("aesthetic score: ", v14_aesthetic_score)


print("==================total=================")
print("average image reward: ")
print(total_image_reward/100)
print("average aesthetic score: ")
print(total_aesthetic_score/100)



  0%|          | 0/45 [00:00<?, ?it/s]

load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.286228895187378
aesthetic score:  tensor([[5.8616]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.5407110452651978
aesthetic score:  tensor([[5.6564]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.1631336212158203
aesthetic score:  tensor([[5.4862]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  0.2987986207008362
aesthetic score:  tensor([[6.2472]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.787033200263977
aesthetic score:  tensor([[

## LCM - LORA

https://huggingface.co/latent-consistency/lcm-lora-sdv1-5

In [6]:
# https://huggingface.co/latent-consistency/lcm-lora-sdxl

import torch
from diffusers import LCMScheduler, AutoPipelineForText2Image

model_id = "stabilityai/stable-diffusion-xl-base-1.0"
adapter_id = "latent-consistency/lcm-lora-sdxl"

sdxlpipe = AutoPipelineForText2Image.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16")
sdxlpipe.scheduler = LCMScheduler.from_config(sdxlpipe.scheduler.config)
sdxlpipe.to("cuda")

# load and fuse lcm lora
sdxlpipe.load_lora_weights(adapter_id)
sdxlpipe.fuse_lora()

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
The config attributes {'skip_prk_steps': True} were passed to LCMScheduler, but are not expected and will be ignored. Please verify your scheduler_config.json configuration file.
  deprecate("fuse_text_encoder_lora", "0.27", LORA_DEPRECATION_MESSAGE)


In [5]:
import torch
import clip
from PIL import Image

num_inference_steps = 45
total_hps = 0

for i in range(len(prompts)):
    prompt = prompts[i]
    
    v14Image = sdxlpipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    text = clip.tokenize(prompt).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    
        hps = image_features @ text_features.T
        print(hps)
    
    total_hps += hps

print(total_hps/100)

  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2822]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2300]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2817]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3564]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3550]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2576]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2756]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2446]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3179]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2773]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3188]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3250]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3054]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2910]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2903]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2832]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2444]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2454]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2678]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3123]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2720]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3115]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2793]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2993]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2957]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2776]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3083]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2776]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2922]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3057]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2048]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2424]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2625]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2432]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2524]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2323]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2712]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2622]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2423]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.1917]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2964]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2751]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3323]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2494]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2832]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.1996]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2451]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2717]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3000]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2781]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2529]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2979]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2654]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2976]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2727]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2524]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2498]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2930]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2991]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2419]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2581]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3184]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2883]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2708]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2361]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2430]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2664]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2795]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2460]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2476]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3088]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2766]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2598]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2822]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3037]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2598]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3232]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2788]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3606]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2852]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2729]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.1804]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2686]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.1812]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2749]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3403]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3105]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3047]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2303]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.3362]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2517]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2932]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2646]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2255]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2795]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2683]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2644]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2113]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2374]], device='cuda:0', dtype=torch.float16)


  0%|          | 0/45 [00:00<?, ?it/s]

tensor([[0.2473]], device='cuda:0', dtype=torch.float16)
tensor([[0.2742]], device='cuda:0', dtype=torch.float16)


In [8]:

num_inference_steps = 45
total_image_reward = 0
total_aesthetic_score = 0


import torch
import ImageReward as reward

sdxlpipe.safety_checker = None

for i in range(len(prompts)):
    prompt = prompts[i]
    
    
    v14Image = sdxlpipe(disable_tqdm = True,prompt=prompt,num_images_per_prompt=1, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    sdxlpipe.set_progress_bar_config(disable=True)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    imageReward = reward.load("ImageReward-v1.0")
    with torch.no_grad():
        v14_image_reward = imageReward.score(prompt, v14Image)
        image_features = model2.encode_image(image)
    
    total_image_reward += v14_image_reward
    print("image reward: ", v14_image_reward)
    im_emb_arr = normalized(image_features.cpu().detach().numpy() )
    v14_aesthetic_score = aestheticPredictorModel(torch.from_numpy(im_emb_arr).to(device).type(torch.cuda.FloatTensor))
    total_aesthetic_score += v14_aesthetic_score
    print("aesthetic score: ", v14_aesthetic_score)


print("==================total=================")
print("average image reward: ")
print(total_image_reward/100)
print("average aesthetic score: ")
print(total_aesthetic_score/100)



  0%|          | 0/45 [00:00<?, ?it/s]

load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  -1.1388907432556152
aesthetic score:  tensor([[5.1207]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  0.2732652425765991
aesthetic score:  tensor([[5.3916]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  0.21898427605628967
aesthetic score:  tensor([[5.0814]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  -1.1769771575927734
aesthetic score:  tensor([[5.4639]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  0.8105146884918213
aesthetic score:  tens

## Image Reward

In [8]:
!pip install image-reward

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting image-reward
  Downloading image_reward-1.5-py3-none-any.whl.metadata (12 kB)
Collecting timm==0.6.13 (from image-reward)
  Downloading timm-0.6.13-py3-none-any.whl.metadata (38 kB)
Collecting fairscale==0.4.13 (from image-reward)
  Downloading fairscale-0.4.13.tar.gz (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.3/266.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Downloading image_reward-1.5-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading timm-0.6.13-py3-none-any.whl (549 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m00:01[

In [10]:

num_inference_steps = 1
# higher inf step == higher clip score?


v14Image_total_image_reward = 0

import os
import torch
import ImageReward as reward


for i in range(len(prompts)):
  prompt = prompts[i]


  v14Image = pipe(disable_tqdm = True,prompt=prompt,num_images_per_prompt=1, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
  sd_pipeline.set_progress_bar_config(disable=True)

  imageReward = reward.load("ImageReward-v1.0")
  with torch.no_grad():
    v14_image_reward = imageReward.score(prompt, v14Image)

  v14Image_total_image_reward += v14_image_reward
  if i % 100 == 0:
    print("-------iteration ", i)
    print('image reward for V1-4:', v14_image_reward)
    print('avg image reward for V1-4:', v14Image_total_image_reward/(i+1))

NameError: name 'pipe' is not defined

## Aesthetic Predictor

In [16]:
!pip install pytorch_lightning

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pytorch_lightning
  Downloading pytorch_lightning-2.2.1-py3-none-any.whl.metadata (21 kB)
Downloading pytorch_lightning-2.2.1-py3-none-any.whl (801 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pytorch_lightning
Successfully installed pytorch_lightning-2.2.1


In [4]:
from PIL import Image
import io
import os
import json

from warnings import filterwarnings


# os.environ["CUDA_VISIBLE_DEVICES"] = "0"    # choose GPU if you are on a multi GPU server
import numpy as np
import torch
import pytorch_lightning as pl
import torch.nn as nn
from torchvision import datasets, transforms
import tqdm

from os.path import join
from datasets import load_dataset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import json

import clip


from PIL import Image, ImageFile


#####  This script will predict the aesthetic score for this image file:

img_path = "test.jpg"





# if you changed the MLP architecture during training, change it also here:
class MLP(pl.LightningModule):
    def __init__(self, input_size, xcol='emb', ycol='avg_rating'):
        super().__init__()
        self.input_size = input_size
        self.xcol = xcol
        self.ycol = ycol
        self.layers = nn.Sequential(
            nn.Linear(self.input_size, 1024),
            #nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            #nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            #nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(64, 16),
            #nn.ReLU(),

            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.layers(x)

    def training_step(self, batch, batch_idx):
            x = batch[self.xcol]
            y = batch[self.ycol].reshape(-1, 1)
            x_hat = self.layers(x)
            loss = F.mse_loss(x_hat, y)
            return loss

    def validation_step(self, batch, batch_idx):
        x = batch[self.xcol]
        y = batch[self.ycol].reshape(-1, 1)
        x_hat = self.layers(x)
        loss = F.mse_loss(x_hat, y)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

def normalized(a, axis=-1, order=2):
    import numpy as np  # pylint: disable=import-outside-toplevel

    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)


In [5]:
aestheticPredictorModel = MLP(768)  # CLIP embedding dim is 768 for CLIP ViT L 14

s = torch.load("sac+logos+ava1-l14-linearMSE.pth")   # load the model you trained previously or the model available in this repo

aestheticPredictorModel.load_state_dict(s)

aestheticPredictorModel.to("cuda")
aestheticPredictorModel.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model2, preprocess = clip.load("ViT-L/14", device=device)  #RN50x64

In [None]:

sd_pipeline.safety_checker = None

num_inference_steps = 70
# higher inf step == higher clip score?


v14Image_total_aesthetic_score = 0

import os
import torch



for i in range(len(prompts)):
  prompt = prompts[i]

  v14Image = sd_pipeline(disable_tqdm = True,prompt=prompt,num_images_per_prompt=1, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
  sd_pipeline.set_progress_bar_config(disable=True)
  image = preprocess(v14Image).unsqueeze(0).to(device)

  with torch.no_grad():
    image_features = model2.encode_image(image)


  im_emb_arr = normalized(image_features.cpu().detach().numpy() )

  v14_aesthetic_score = aestheticPredictorModel(torch.from_numpy(im_emb_arr).to(device).type(torch.cuda.FloatTensor))


  v14Image_total_aesthetic_score += v14_aesthetic_score
  if i % 100 == 0:
    print("-------iteration ", i)
    print('aesthetic score for V1-4:', v14_aesthetic_score)
    print('avg aesthetic score for V1-4:', v14Image_total_aesthetic_score/(i+1))


## Human Preference

In [6]:
# !pip install ftfy regex tqdm
# !pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Using cached ftfy-6.1.3-py3-none-any.whl.metadata (6.2 kB)
Using cached ftfy-6.1.3-py3-none-any.whl (53 kB)
Installing collected packages: ftfy
Successfully installed ftfy-6.1.3
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-uz37rfiw
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-uz37rfiw
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone
Collecting torchvision (from clip==1.0)
  Using cached torchvision-0.17.1-cp39-cp39-manylinux1_x86_64.whl.metadata (6.6 kB)
Using cached torchvision-0.17.1-cp39-cp39-manylinux1_x86_64.whl (6.9 MB)
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369497 sha256=db61f746ba5dae8eae2c998ebc528a714a4

In [65]:
import torch
import clip
from PIL import Image

num_inference_steps = 45
total_hps = 0

for i in range(len(prompts)):
    prompt = prompts[i]
    
    v14Image = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    text = clip.tokenize(prompt).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    
        hps = image_features @ text_features.T
        print(hps)
    
    total_hps += hps

print(total_hps/100)

## Clipscore evaluation

In [None]:
total_clip_score = 0
for prompt in prompts:
  num_inference_steps = 75

  image = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, output_type='np' ).images

  img_clip_score = calculate_clip_score(image, prompt)
  total_clip_score += img_clip_score
  print(img_clip_score)

# clip score:
total_clip_score = total_clip_score/100
print(total_clip_score)

## DPO

In [9]:
import torch
from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler
from diffusers.utils import make_image_grid

sdxl_dpo_lora_pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)

sdxl_dpo_lora_pipe.safety_checker = None
sdxl_dpo_lora_pipe.set_progress_bar_config(disable=True)


sdxl_dpo_lora_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    sdxl_dpo_lora_pipe.scheduler.config,
    use_karras_sigmas=True,
    algorithm_type="sde-dpmsolver++"
)

sdxl_dpo_lora_pipe.to("cuda");

seed = 12341234123
prompt = "professional portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography"
negative_prompt = "3d render, cartoon, drawing, art, low light, blur, pixelated, low resolution, black and white"
num_inference_steps = 40 # Keep
height = 1024
width = height
guidance_scale = 7.5

sdxl_dpo_lora_pipe.unload_lora_weights()
sdxl_dpo_lora_pipe.load_lora_weights(
    "radames/sdxl-DPO-LoRA",
    adapter_name="sdxl-dpo-lora",
)
# sdxl_dpo_lora_pipe.set_adapters(["sdxl-dpo-lora"], adapter_weights=[0.9])
generator = torch.Generator().manual_seed(seed)

You are using `unload_lora_weights` to disable and unload lora weights. If you want to iteratively enable and disable adapter weights,you can use `pipe.enable_lora()` or `pipe.disable_lora()`. After installing the latest version of PEFT.


In [5]:
total_clip_score = 0
for prompt in prompts:
  num_inference_steps = 75

  image = sdxl_dpo_lora_pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0, output_type='np' ).images

  img_clip_score = calculate_clip_score(image, prompt)
  total_clip_score += img_clip_score
  print(img_clip_score)

# clip score:
total_clip_score = total_clip_score/100
print(total_clip_score)

Unused or unrecognized kwargs: padding.


32.5879


Unused or unrecognized kwargs: padding.


31.784


Unused or unrecognized kwargs: padding.


30.3304


Unused or unrecognized kwargs: padding.


31.8873


Unused or unrecognized kwargs: padding.


36.3989


Unused or unrecognized kwargs: padding.


31.0711


Unused or unrecognized kwargs: padding.


32.718


Unused or unrecognized kwargs: padding.


31.9578


Unused or unrecognized kwargs: padding.


31.8031


Unused or unrecognized kwargs: padding.


32.5143


Unused or unrecognized kwargs: padding.


32.7143


Unused or unrecognized kwargs: padding.


32.3364


Unused or unrecognized kwargs: padding.


34.1734


Unused or unrecognized kwargs: padding.


30.2036


Unused or unrecognized kwargs: padding.


35.4061


Unused or unrecognized kwargs: padding.


29.208


Unused or unrecognized kwargs: padding.


34.4529


Unused or unrecognized kwargs: padding.


31.1332


Unused or unrecognized kwargs: padding.


31.4616


Unused or unrecognized kwargs: padding.


34.0074


Unused or unrecognized kwargs: padding.


36.3081


Unused or unrecognized kwargs: padding.


33.5866


Unused or unrecognized kwargs: padding.


32.5685


Unused or unrecognized kwargs: padding.


33.4973


Unused or unrecognized kwargs: padding.


33.9481


Unused or unrecognized kwargs: padding.


32.4823


Unused or unrecognized kwargs: padding.


29.6001


Unused or unrecognized kwargs: padding.


33.2653


Unused or unrecognized kwargs: padding.


31.7234


Unused or unrecognized kwargs: padding.


32.969


Unused or unrecognized kwargs: padding.


32.0861


Unused or unrecognized kwargs: padding.


31.1955


Unused or unrecognized kwargs: padding.


29.8531


Unused or unrecognized kwargs: padding.


29.7644


Unused or unrecognized kwargs: padding.


29.0871


Unused or unrecognized kwargs: padding.


29.5865


Unused or unrecognized kwargs: padding.


32.9337


Unused or unrecognized kwargs: padding.


29.6747


Unused or unrecognized kwargs: padding.


29.1009


Unused or unrecognized kwargs: padding.


29.8807


Unused or unrecognized kwargs: padding.


31.6919


Unused or unrecognized kwargs: padding.


30.1104


Unused or unrecognized kwargs: padding.


36.3283


Unused or unrecognized kwargs: padding.


29.4805


Unused or unrecognized kwargs: padding.


30.5469


Unused or unrecognized kwargs: padding.


27.2548


Unused or unrecognized kwargs: padding.


30.2223


Unused or unrecognized kwargs: padding.


28.3078


Unused or unrecognized kwargs: padding.


28.4038


Unused or unrecognized kwargs: padding.


28.8244


Unused or unrecognized kwargs: padding.


29.9853


Unused or unrecognized kwargs: padding.


31.0451


Unused or unrecognized kwargs: padding.


28.0435


Unused or unrecognized kwargs: padding.


32.0884


Unused or unrecognized kwargs: padding.


31.4324


Unused or unrecognized kwargs: padding.


30.8285


Unused or unrecognized kwargs: padding.


30.2037


Unused or unrecognized kwargs: padding.


29.5009


Unused or unrecognized kwargs: padding.


29.3453


Unused or unrecognized kwargs: padding.


27.8252


Unused or unrecognized kwargs: padding.


34.5365


Unused or unrecognized kwargs: padding.


33.4632


Unused or unrecognized kwargs: padding.


30.5633


Unused or unrecognized kwargs: padding.


28.1788


Unused or unrecognized kwargs: padding.


34.7838


Unused or unrecognized kwargs: padding.


32.5201


Unused or unrecognized kwargs: padding.


29.8042


Unused or unrecognized kwargs: padding.


32.3167


Unused or unrecognized kwargs: padding.


34.9963


Unused or unrecognized kwargs: padding.


32.1244


Unused or unrecognized kwargs: padding.


35.6452


Unused or unrecognized kwargs: padding.


29.5545


Unused or unrecognized kwargs: padding.


27.8833


Unused or unrecognized kwargs: padding.


31.6157


Unused or unrecognized kwargs: padding.


31.8277


Unused or unrecognized kwargs: padding.


35.4374


Unused or unrecognized kwargs: padding.


34.1643


Unused or unrecognized kwargs: padding.


35.1847


Unused or unrecognized kwargs: padding.


35.5846


Unused or unrecognized kwargs: padding.


32.0361


Unused or unrecognized kwargs: padding.


38.3483


Unused or unrecognized kwargs: padding.


29.3821


Unused or unrecognized kwargs: padding.


34.8622


Unused or unrecognized kwargs: padding.


34.8952


Unused or unrecognized kwargs: padding.


34.7881


Unused or unrecognized kwargs: padding.


37.5516


Unused or unrecognized kwargs: padding.


35.5106


Unused or unrecognized kwargs: padding.


36.7698


Unused or unrecognized kwargs: padding.


35.2862


Unused or unrecognized kwargs: padding.


35.576


Unused or unrecognized kwargs: padding.


27.1213


Unused or unrecognized kwargs: padding.


29.1381


Unused or unrecognized kwargs: padding.


26.3949


Unused or unrecognized kwargs: padding.


30.3773


Unused or unrecognized kwargs: padding.


27.8902


Unused or unrecognized kwargs: padding.


30.6305


Unused or unrecognized kwargs: padding.


27.1002


Unused or unrecognized kwargs: padding.


30.185


Unused or unrecognized kwargs: padding.


31.4924


Unused or unrecognized kwargs: padding.


26.5593
31.74810599999998


In [None]:
total_clip_score

In [6]:
import torch
import clip
from PIL import Image

num_inference_steps = 75
total_hps = 0

for i in range(len(prompts)):
    prompt = prompts[i]
    
    v14Image = sdxl_dpo_lora_pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    text = clip.tokenize(prompt).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    
        hps = image_features @ text_features.T
        print(hps)
    
    total_hps += hps

print(total_hps/100)

tensor([[0.3726]], device='cuda:0', dtype=torch.float16)
tensor([[0.3303]], device='cuda:0', dtype=torch.float16)
tensor([[0.2935]], device='cuda:0', dtype=torch.float16)
tensor([[0.3633]], device='cuda:0', dtype=torch.float16)
tensor([[0.3550]], device='cuda:0', dtype=torch.float16)
tensor([[0.3364]], device='cuda:0', dtype=torch.float16)
tensor([[0.3403]], device='cuda:0', dtype=torch.float16)
tensor([[0.3345]], device='cuda:0', dtype=torch.float16)
tensor([[0.3462]], device='cuda:0', dtype=torch.float16)
tensor([[0.3086]], device='cuda:0', dtype=torch.float16)
tensor([[0.3486]], device='cuda:0', dtype=torch.float16)
tensor([[0.3364]], device='cuda:0', dtype=torch.float16)
tensor([[0.3374]], device='cuda:0', dtype=torch.float16)
tensor([[0.3047]], device='cuda:0', dtype=torch.float16)
tensor([[0.3477]], device='cuda:0', dtype=torch.float16)
tensor([[0.2510]], device='cuda:0', dtype=torch.float16)
tensor([[0.3022]], device='cuda:0', dtype=torch.float16)
tensor([[0.3059]], device='cuda

In [10]:

num_inference_steps = 75
total_image_reward = 0
total_aesthetic_score = 0


import torch
import ImageReward as reward

sdxl_dpo_lora_pipe.safety_checker = None

for i in range(len(prompts)):
    prompt = prompts[i]
    
    
    v14Image = sdxl_dpo_lora_pipe(disable_tqdm = True,prompt=prompt,num_images_per_prompt=1, num_inference_steps=num_inference_steps, guidance_scale=8.0).images[0]
    sdxl_dpo_lora_pipe.set_progress_bar_config(disable=True)
    
    image = preprocess(v14Image).unsqueeze(0).to(device)
    imageReward = reward.load("ImageReward-v1.0")
    with torch.no_grad():
        v14_image_reward = imageReward.score(prompt, v14Image)
        image_features = model2.encode_image(image)
    
    total_image_reward += v14_image_reward
    print("image reward: ", v14_image_reward)
    im_emb_arr = normalized(image_features.cpu().detach().numpy() )
    v14_aesthetic_score = aestheticPredictorModel(torch.from_numpy(im_emb_arr).to(device).type(torch.cuda.FloatTensor))
    total_aesthetic_score += v14_aesthetic_score
    print("aesthetic score: ", v14_aesthetic_score)


print("==================total=================")
print("average image reward: ")
print(total_image_reward/100)
print("average aesthetic score: ")
print(total_aesthetic_score/100)



load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.7381075620651245
aesthetic score:  tensor([[5.7677]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.7845016717910767
aesthetic score:  tensor([[6.2547]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  0.3458864390850067
aesthetic score:  tensor([[5.4840]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.4757494926452637
aesthetic score:  tensor([[6.1185]], device='cuda:0', grad_fn=<AddmmBackward0>)
load checkpoint from /home/research/jenzheng/.cache/ImageReward/ImageReward.pt
checkpoint loaded
image reward:  1.876319169998169
aesthetic score:  tensor([