In [1]:
!pip install diffusers
!pip install accelerate
!pip install torchmetrics
!pip install torch-fidelity
!pip install torchmetrics[image]




In [2]:
# Imports

import torch
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
from torchmetrics.functional.multimodal import clip_score
from functools import partial

from PIL import Image
import os
import numpy as np

from torchvision.transforms import functional as F
from torchmetrics.image.fid import FrechetInceptionDistance


In [3]:
# Configure base model pipeline
base_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
base_pipe = base_pipe.to("cuda")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [4]:
prompts = [
    "A photo of a baseball player throwing a pitch",
    "A sports player standing on a mound throwing a pitch",
    "A photo of throwing a pitch",
    "A realistic photo of throwing a pitch",
    "A woman with brown hair throwing a pitch",
]

images_base = base_pipe(prompts, num_images_per_prompt=3, output_type="np").images


for i in range(images_base.shape[0]):
    plt.imshow(images_base[i])
    plt.axis('off')  # Turn off axis labels
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [5]:
# Calculate CLIP Score for base SD model:
clip_score_fn = partial(clip_score, model_name_or_path="openai/clip-vit-base-patch16")

def calculate_clip_score(images_base, prompts):
    images_int = (images_base * 255).astype("uint8")
    clip_score = clip_score_fn(torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts).detach()
    return round(float(clip_score), 4)

prompts = [prompt for prompt in prompts for _ in range(3)]


sd_clip_score = calculate_clip_score(images_base, prompts)
print(f"CLIP score: {sd_clip_score}")

CLIP score: 31.3535


# Textual Inversion CLIP Evaluation

In [6]:
# Add textual inversion to model pipeline
textual_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
textual_pipe = textual_pipe.to("cuda")
textual_pipe.load_textual_inversion("agangal/text-inversion-model-baseball")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [7]:
prompts_text = [
    "A photo of a baseball player <baseball-pitching>",
    "A sports player standing on a mound <baseball-pitching>",
    "A photo of <baseball-pitching>",
    "A realistic photo of <baseball-pitching>",
    "A woman with brown hair <baseball-pitching>",
]

images_text = textual_pipe(prompts, num_images_per_prompt=1, output_type="np").images

for i in range(images_text.shape[0]):
    plt.imshow(images_text[i])
    plt.axis('off')  # Turn off axis labels
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [8]:
# Calculate CLIP Score for textual inversion SD model
prompts_text = [prompt for prompt in prompts_text for _ in range(3)]
text_clip_score = calculate_clip_score(images_text, prompts_text)
print(f"CLIP score: {text_clip_score}")

CLIP score: 32.0159


# LoRA CLIP Evaluation

In [9]:
lora_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
lora_pipe = lora_pipe.to("cuda")
lora_pipe.load_lora_weights("agangal/sd-model-finetuned-lora-baseball-pitching")

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet.{module_name}': params for module_name, params in old_state_dict.items()}`.


In [10]:
# Generate samples
prompts_lora = [
    "A photo of a baseball player throwing a pitch",
    "A sports player standing on a mound throwing a pitch",
    "A photo of throwing a pitch",
    "A realistic photo of throwing a pitch",
    "A woman with brown hair throwing a pitch",
]

images_lora = lora_pipe(prompts, num_images_per_prompt=3, output_type="np").images


for i in range(images_lora.shape[0]):
    plt.imshow(images_lora[i])
    plt.axis('off')  # Turn off axis labels
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [11]:
# Calculate CLIP Score for LoRA SD model
prompts_lora = [prompt for prompt in prompts_lora for _ in range(3)]
text_clip_score = calculate_clip_score(images_lora, prompts_lora)
print(f"CLIP score: {text_clip_score}")

CLIP score: 31.1417


# FID Evaluation

In [12]:
# FID Score
# First, upload and grab real images (choice of 15)
dataset_path = "real_images"
image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])

real_images = [np.array(Image.open(path).convert("RGB")) for path in image_paths]

In [13]:
# Preprocess real images
def preprocess_image(image):
    image = torch.tensor(image).unsqueeze(0)
    image = image.permute(0, 3, 1, 2) / 255.0
    return image

real_images = torch.cat([preprocess_image(image) for image in real_images])
print(real_images.shape)

torch.Size([15, 3, 512, 512])


In [14]:
# preprocess fake images
images_base = torch.tensor(images_base)
images_base = images_base.permute(0, 3, 1, 2)
print(images_base.shape)

images_text = torch.tensor(images_text)
images_text = images_text.permute(0, 3, 1, 2)
print(images_text.shape)

images_lora = torch.tensor(images_lora)
images_lora = images_lora.permute(0, 3, 1, 2)
print(images_lora.shape)

torch.Size([15, 3, 512, 512])
torch.Size([15, 3, 512, 512])
torch.Size([15, 3, 512, 512])


In [15]:
# Compute FID
import torch_fidelity
fid_base = FrechetInceptionDistance(normalize=True)
fid_base.update(real_images, real=True)
fid_base.update(images_base, real=False)

print(f"FID for base model: {float(fid_base.compute())}")

fid_text = FrechetInceptionDistance(normalize=True)
fid_text.update(real_images, real=True)
fid_text.update(images_text, real=False)

print(f"FID for textual inversion model: {float(fid_text.compute())}")

fid_lora = FrechetInceptionDistance(normalize=True)
fid_lora.update(real_images, real=True)
fid_lora.update(images_lora, real=False)

print(f"FID for LoRA model: {float(fid_lora.compute())}")

Downloading: "https://github.com/toshas/torch-fidelity/releases/download/v0.2.0/weights-inception-2015-12-05-6726825d.pth" to /root/.cache/torch/hub/checkpoints/weights-inception-2015-12-05-6726825d.pth
100%|██████████| 91.2M/91.2M [00:00<00:00, 293MB/s]


FID for base model: 92.65239715576172
FID for textual inversion model: 116.80029296875
FID for LoRA model: 71.42942810058594
