In [1]:
# Install necessary packages
!pip install --quiet --upgrade diffusers transformers accelerate mediapy datasets fsspec gcsfs

# Restart the runtime after installations
import IPython
IPython.display.clear_output()
print("Installation complete. Please restart the runtime to continue.")

Installation complete. Please restart the runtime to continue.


In [1]:
# Import necessary libraries
import mediapy as media
import random
import sys
import torch
import time
import os
from PIL import Image
from datasets import load_dataset
from torchvision import transforms
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from diffusers import (
    DiffusionPipeline,
    DDPMScheduler,
    StableDiffusionImg2ImgPipeline,
)
from diffusers.models.attention_processor import LoRAAttnProcessor, AttnProcessor

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Using device: cuda


In [2]:
# Load a smaller dataset and select the 'train' split
dataset = load_dataset("poloclub/diffusiondb", "2m_first_1k")['train']

# Define fantasy/adventure keywords
keywords = [
    "fantasy", "dragon", "castle", "wizard", "magic", "adventure", "sword", "elf",
    "mythical", "legend", "quest", "kingdom", "battle", "hero", "sorcerer", "knight",
    "fairy", "goblin", "orc", "dwarf", "epic", "spell", "enchant", "troll", "giant",
    "unicorn", "phoenix", "griffin", "saga", "tale", "myth", "witch", "warlock",
    "alchemy", "prophecy", "chronicle", "beast", "monster", "dungeon", "warrior",
    "ancient", "mystic", "artifact", "spellbook", "darkness", "rune", "shadow", "realm",
    "legendary", "folklore", "creature", "epic battle", "magical", "enchanted", "sorcery"
]

# Filter the dataset based on keywords
def filter_fantasy_adventure(example):
    text = example['prompt'].lower()
    return any(keyword in text for keyword in keywords)

fantasy_dataset = dataset.filter(filter_fantasy_adventure)
print(f"Number of fantasy/adventure images: {len(fantasy_dataset)}")

def download_and_preprocess_images(dataset, num_images=50):
    preprocess = transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ])

    images = []
    prompts = []

    for i, sample in enumerate(dataset.select(range(num_images))):
        image_data = sample['image']
        prompt = sample['prompt']
        try:
            # Convert PIL Image to tensor
            img = preprocess(image_data)
            images.append(img)
            prompts.append(prompt)
            print(f"Processed Image {i + 1}")
        except Exception as e:
            print(f"Error processing Image {i + 1}: {e}")
        if len(images) >= num_images:
            break

    return images, prompts

# Download and preprocess images
num_images = 50  # Adjust based on Colab's capacity
print("\nProcessing images...")
images, prompts = download_and_preprocess_images(fantasy_dataset, num_images=num_images)
print(f"Processed {len(images)} images.")

# Create a custom dataset
class CustomImageDataset(torch.utils.data.Dataset):
    def __init__(self, images, prompts):
        self.images = images
        self.prompts = prompts

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        prompt = self.prompts[idx]
        return {'pixel_values': image, 'prompt': prompt}

train_dataset = CustomImageDataset(images, prompts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/25.0k [00:00<?, ?B/s]

diffusiondb.py:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

The repository for poloclub/diffusiondb contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/poloclub/diffusiondb.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


part-000002.zip:   0%|          | 0.00/581M [00:00<?, ?B/s]

metadata.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Number of fantasy/adventure images: 311

Processing images...
Processed Image 1
Processed Image 2
Processed Image 3
Processed Image 4
Processed Image 5
Processed Image 6
Processed Image 7
Processed Image 8
Processed Image 9
Processed Image 10
Processed Image 11
Processed Image 12
Processed Image 13
Processed Image 14
Processed Image 15
Processed Image 16
Processed Image 17
Processed Image 18
Processed Image 19
Processed Image 20
Processed Image 21
Processed Image 22
Processed Image 23
Processed Image 24
Processed Image 25
Processed Image 26
Processed Image 27
Processed Image 28
Processed Image 29
Processed Image 30
Processed Image 31
Processed Image 32
Processed Image 33
Processed Image 34
Processed Image 35
Processed Image 36
Processed Image 37
Processed Image 38
Processed Image 39
Processed Image 40
Processed Image 41
Processed Image 42
Processed Image 43
Processed Image 44
Processed Image 45
Processed Image 46
Processed Image 47
Processed Image 48
Processed Image 49
Processed Image 

In [3]:
# Load the pre-trained Stable Diffusion model
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
pipe = DiffusionPipeline.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    variant="fp16",
).to(device)

unet = pipe.unet
unet.train()

model_index.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

model.fp16.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder_2/config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer_2/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer_2/tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/5.14G [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

UNet2DConditionModel(
  (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=320, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (add_time_proj): Timesteps()
  (add_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=2816, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (down_blocks): ModuleList(
    (0): DownBlock2D(
      (resnets): ModuleList(
        (0-1): 2 x ResnetBlock2D(
          (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)
          (conv1): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)
          (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)
          (dropout): Dropout(p=

In [4]:
# Set up LoRA for the UNet's attention processors
from diffusers.models.attention_processor import LoRAAttnProcessor

lora_rank = 16  # You can adjust this value

# Create a dictionary to hold the new attention processors
lora_attn_procs = {}
for name, module in unet.attn_processors.items():
    # Initialize LoRAAttnProcessor without arguments
    if isinstance(module, AttnProcessor):
        lora_attn_procs[name] = LoRAAttnProcessor()
    else:
        lora_attn_procs[name] = module  # Keep other processors unchanged

# Set the attention processors to the UNet
unet.set_attn_processor(lora_attn_procs)

# Freeze all parameters except LoRA layers
for param in unet.parameters():
    param.requires_grad = False

# Set requires_grad = True for LoRA parameters
for module in unet.attn_processors.values():
    if isinstance(module, LoRAAttnProcessor):
        for param in module.parameters():
            param.requires_grad = True
        # Set the rank if necessary
        module.rank = lora_rank

# Confirm that there are trainable parameters
trainable_params = []
for name, param in unet.named_parameters():
    if param.requires_grad:
        trainable_params.append((name, param))

if not trainable_params:
    print("No trainable parameters found in the model.")
else:
    print(f"Number of trainable parameters: {len(trainable_params)}")
    print("Trainable parameters:")
    for name, param in trainable_params:
        print(f"- {name}, shape: {param.shape}")


# Training parameters
batch_size = 1  # Adjust based on GPU memory
num_epochs = 1  # Increase if you have more time/resources
learning_rate = 1e-4

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Optimizer
optimizer = AdamW(filter(lambda p: p.requires_grad, unet.parameters()), lr=learning_rate)

# Scheduler
noise_scheduler = DDPMScheduler.from_config(pipe.scheduler.config)

# Define the training loop
def train_one_epoch(unet, dataloader, optimizer, noise_scheduler):
    unet.train()
    for batch in tqdm(dataloader):
        pixel_values = batch['pixel_values'].to(device, dtype=torch.float16)
        prompt = batch['prompt']

        # Encode the prompt
        text_input = pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt"
        )
        encoder_hidden_states = pipe.text_encoder(text_input.input_ids.to(device))[0]

        # Sample noise
        noise = torch.randn_like(pixel_values).to(device)

        # Sample random timesteps
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (pixel_values.shape[0],), device=device).long()

        # Add noise to images
        noisy_images = noise_scheduler.add_noise(pixel_values, noise, timesteps)

        # Predict the noise residual
        model_pred = unet(noisy_images, timesteps, encoder_hidden_states).sample

        # Compute loss
        loss = nn.functional.mse_loss(model_pred.float(), noise.float())

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Optionally, print loss
        # print(f"Loss: {loss.item()}")

# Train the model
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_one_epoch(unet, train_dataloader, optimizer, noise_scheduler)

# Save the LoRA weights
output_dir = "./fine_tuned_lora"
os.makedirs(output_dir, exist_ok=True)
unet.save_attn_procs(output_dir)

# Load the fine-tuned LoRA weights
pipe.unet.load_attn_procs(output_dir)
pipe.unet.to(device)
pipe.unet.eval()

No trainable parameters found in the model.


ValueError: optimizer got an empty parameter list

In [None]:
# Generate the prompt for the scene
initial_description =  """
Under a moonless sky, Jonathan Harker, a newly qualified English solicitor, arrives at the ancient, towering castle of Count Dracula in the remote Carpathian Mountains. The chill night air is heavy with mist as massive wooden doors swing open. Count Dracula, a tall, gaunt figure with piercing eyes and a sly smile, greets him warmly, yet something unsettling lingers in the air.
"""

seed = random.randint(0, sys.maxsize)
guidance_scale = 5.0
num_inference_steps = 50  # Increase for better quality
image_width, image_height = 512, 512  # Adjust based on GPU memory

start_time = time.time()

# Generate images using the fine-tuned model
generator = torch.Generator(device).manual_seed(seed)
images = pipe(
    prompt=initial_description,
    num_inference_steps=num_inference_steps,
    guidance_scale=guidance_scale,
    generator=generator,
    height=image_height,
    width=image_width
).images

end_time = time.time()
execution_time = end_time - start_time

print(f"Prompt:\t{initial_description}\nSeed:\t{seed}")
print(f"Execution Time: {execution_time:.2f} seconds")
media.show_images(images)
images[0].save("output.jpg")

In [None]:
# Continue with your img2img pipeline
pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    variant="fp16",
).to(device)

# Load the fine-tuned LoRA weights
pipe_img2img.unet.load_attn_procs(output_dir)
pipe_img2img.unet.to(device)
pipe_img2img.unet.eval()

init_image = Image.open("output.jpg").convert("RGB")

prompt = "Unable to resist his growing curiosity, Jonathan Harker decides to explore the castle despite the Count's warnings. Suddenly, three enigmatic women with pale skin and captivating eyes appear, moving toward him with a mesmerizing grace."

images = pipe_img2img(
    prompt=prompt,
    image=init_image,
    strength=0.75,
    guidance_scale=10,
    num_inference_steps=50,  # Increase for better quality
    height=512,
    width=512  # Adjust based on GPU memory
).images

media.show_images(images)
images[0].save("output2.jpg")

# Next scene
prompt = "Just as one of the women reaches out to touch him, a sudden gust of icy wind extinguishes the nearby candles, plunging the corridor into darkness. Jonathan's heart races as he feels an oppressive presence behind him. The candles flare back to life, revealing Count Dracula standing between him and the women, his eyes blazing with a stern warning."

images = pipe_img2img(
    prompt=prompt,
    image=init_image,
    strength=0.75,
    guidance_scale=9,
    num_inference_steps=50,  # Increase for better quality
    height=512,
    width=512
).images

media.show_images(images)
images[0].save("output3.jpg")