In [1]:
!pip install diffusers --upgrade
!pip install invisible_watermark transformers accelerate safetensors

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable




In [2]:
from diffusers import DiffusionPipeline, StableDiffusionPipeline
import transformers

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.transforms import RandAugment
from IPython.core.debugger import set_trace
import os

import numpy as np
import matplotlib.pyplot as plt
import math
from PIL import Image

In [3]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the model to use off huggingface
model_id = "CompVis/stable-diffusion-v1-4"

# Path to directory containing images of the subject we want to use dreambooth on
dataset_path = '/home/ahm247/dreambooth/dataset/dog6'

# Path to our 200 photos of our prior found online. For another class generate the data
classes_path = '/home/ahm247/dreambooth/class-images'

# Prior and fine-tuning prompts
prior_prompt = 'A dog'
id_prompt = 'A sks dog'

In [4]:
# Define Dataset class for prior images
class CustomImageDataset(torch.utils.data.Dataset):
    def __init__(self, directory, transform=None):
        self.directory = directory 
        self.transform = transform
        self.image_paths = [os.path.join(directory, filename) for filename in os.listdir(directory)]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert('RGB')
    
        if self.transform:
            image = self.transform(image)

        return image


In [5]:
device

device(type='cuda')

In [6]:
finetuned_pipe = StableDiffusionPipeline.from_pretrained(model_id, 
                                                torch_dtype=torch.float16,
                                                use_safetensors=True,
                                                variant="fp16")
finetuned_pipe.to(device)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.27.2",
  "_name_or_path": "CompVis/stable-diffusion-v1-4",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [7]:
prior_token = finetuned_pipe.tokenizer(prior_prompt, return_tensors='pt').to(device)
id_token = finetuned_pipe.tokenizer(id_prompt, return_tensors='pt').to(device)

prior_input_ids = prior_token['input_ids']
prior_attention_masks = prior_token['attention_mask']

with torch.no_grad():
    prior_encoder_hidden = finetuned_pipe.text_encoder(input_ids=prior_input_ids, attention_mask=prior_attention_masks)
    
id_input_ids = id_token['input_ids']
id_attention_masks = id_token['attention_mask']

with torch.no_grad(): 
    id_encoder_hidden = finetuned_pipe.text_encoder(input_ids=id_input_ids, attention_mask=id_attention_masks)
    
print(type(prior_encoder_hidden))
print(type(id_encoder_hidden))
# Setting up the datasets/dataloaders
transform = transforms.Compose([
    transforms.Resize((512,512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

prior_dataset = CustomImageDataset(directory=classes_path, transform=transform)
id_dataset = CustomImageDataset(directory=dataset_path, transform=transform)
prior_dataloader = torch.utils.data.DataLoader(prior_dataset, batch_size=1, shuffle=True)
id_dataloader = torch.utils.data.DataLoader(id_dataset, batch_size=1, shuffle=True)

<class 'transformers.modeling_outputs.BaseModelOutputWithPooling'>
<class 'transformers.modeling_outputs.BaseModelOutputWithPooling'>


In [8]:
print(type(prior_encoder_hidden.last_hidden_state))
print(type(id_encoder_hidden.last_hidden_state))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [9]:
NUM_EPOCHS = 10
accumulation_steps = 4
finetuned_pipe.unet.train()
optimizer = optim.AdamW(finetuned_pipe.unet.parameters(), 
                        lr=5e-6,
                        betas=(0.9,0.999),
                        weight_decay=1e-2,
                        eps=1e-08)
mse_loss = nn.MSELoss()
max_timesteps = finetuned_pipe.scheduler.num_train_timesteps

for epoch in range(NUM_EPOCHS):

    for (prior_images, id_images) in zip(prior_dataloader, id_dataloader):
        prior_images = prior_images.to(torch.float16).to(device)
        id_images = id_images.to(torch.float16).to(device)

        prior_latent = finetuned_pipe.vae.encode(prior_images).latent_dist.sample()
        prior_latent *= 0.18215
        noisy_prior_latent = prior_latent + torch.randn_like(prior_latent)

        id_latent = finetuned_pipe.vae.encode(id_images).latent_dist.sample()
        id_latent *= 0.18215
        noisy_id_latent = id_latent + torch.randn_like(id_latent)

        denoised_prior_latent = finetuned_pipe.unet(noisy_prior_latent, timestep=max_timesteps, encoder_hidden_states=prior_encoder_hidden.last_hidden_state)
        denoised_id_latent = finetuned_pipe.unet(noisy_id_latent, timestep=max_timesteps, encoder_hidden_states=id_encoder_hidden.last_hidden_state) 

        # Forward pass
        optimizer.zero_grad()

        # Calculate loss
        loss_id = mse_loss(denoised_id_latent.sample, id_latent)
        loss_pr = mse_loss(denoised_prior_latent.sample, prior_latent)
        loss = loss_id + loss_pr

        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)


tensor(False, device='cuda:0')
tensor(False, device='cuda:0')
Epoch 1, Loss: 2.938199758529663


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 22.16 GiB of which 8.12 MiB is free. Including non-PyTorch memory, this process has 22.15 GiB memory in use. Of the allocated memory 21.01 GiB is allocated by PyTorch, and 56.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)