In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from diffusers import StableDiffusionPipeline, UNet2DConditionModel, AutoencoderKL
from diffusers.models.attention_processor import LoRAAttnProcessor

from transformers import CLIPTokenizer, CLIPTextModel

from PIL import Image
from pathlib import Path
import os
from tqdm import tqdm

import numpy as np


In [2]:

model_id = "CompVis/stable-diffusion-v1-4"

pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    revision="fp16",
).to("cuda")

# Freeze VAE and Text Encoder
pipe.vae.requires_grad_(False)
pipe.text_encoder.requires_grad_(False)

unet = pipe.unet
tokenizer = pipe.tokenizer
text_encoder = pipe.text_encoder

 The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title 'CompVis/stable-diffusion-v1-4 is missing fp16 files' so that the correct variant file can be added.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

An error occurred while trying to fetch C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\unet: Error no file named diffusion_pytorch_model.safetensors found in directory C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
An error occurred while trying to fetch C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\vae: Error no file named diffusion_pytorch_model.safetensors found in directory C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\vae.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.


In [5]:
# Set LoRA Adapters for UNet
rank = 4  # LoRA rank

# Iterate through all attention processors
for name, module in unet.attn_processors.items():
    if isinstance(module, LoRAAttnProcessor):
        continue  # Already LoRA
    cross_attention_dim = module.cross_attention_dim if hasattr(module, "cross_attention_dim") else None
    hidden_size = module.hidden_size if hasattr(module, "hidden_size") else None

    if cross_attention_dim is None or hidden_size is None:
        continue

    # Create LoRA processor
    lora_attn_processor = LoRAAttnProcessor(
        hidden_size=hidden_size,
        cross_attention_dim=cross_attention_dim,
        rank=rank
    )

    # Set it
    unet.set_attn_processor(name, lora_attn_processor)

print("LoRA injected successfully into UNet!")


LoRA injected successfully into UNet!


In [6]:
class BottleDefectDataset(Dataset):
    def __init__(self, image_paths, captions, tokenizer, resolution=(512,512)):
        self.image_paths = image_paths
        self.captions = captions
        self.tokenizer = tokenizer
        self.resolution = resolution

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        image = image.resize(self.resolution)
        image = np.array(image).astype(np.float32) / 255.0
        image = torch.tensor(image).permute(2,0,1)

        inputs = self.tokenizer(
            self.captions[idx],
            padding="max_length",
            max_length=77,
            truncation=True,
            return_tensors="pt"
        )

        return {
            "pixel_values": image,
            "input_ids": inputs.input_ids.squeeze(0),
            "attention_mask": inputs.attention_mask.squeeze(0)
        }


In [7]:
path_1, path_2 = "../dataset/bottle/image/broken_large-000.png", "../dataset/bottle/image/broken_large-001.png"
caption_1, caption_2 = "../dataset/bottle/image/broken_large-000.txt", "../dataset/bottle/image/broken_large-001.txt"
train_dataset = BottleDefectDataset(
    image_paths=[path_1, path_2],  # your 2 images per defect
    captions=[caption_1, caption_2], 
    tokenizer=tokenizer
)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [8]:
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, unet.parameters()), lr=1e-4)

In [9]:
num_epochs = 50

unet.train()
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader)
    for batch in pbar:
        optimizer.zero_grad()

        latents = pipe.vae.encode(batch["pixel_values"].to("cuda").half()).latent_dist.sample()
        latents = latents * 0.18215  # VAE scaling

        encoder_hidden_states = pipe.text_encoder(batch["input_ids"].to("cuda"))[0]

        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, 1000, (latents.shape[0],), device=latents.device).long()

        noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)
        noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

        loss = F.mse_loss(noise_pred, noise)
        loss.backward()

        optimizer.step()

        pbar.set_description(f"Loss: {loss.item():.4f}")


Loss: nan: 100%|██████████| 2/2 [00:43<00:00, 21.65s/it]   
Loss: nan: 100%|██████████| 2/2 [00:36<00:00, 18.38s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.76s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.53s/it]
Loss: nan: 100%|██████████| 2/2 [00:36<00:00, 18.41s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.56s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.54s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.85s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.58s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.58s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.58s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.99s/it]
Loss: nan: 100%|██████████| 2/2 [00:36<00:00, 18.18s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.99s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.89s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.55s/it]
Loss: nan: 100%|██████████| 2/2 [00:37<00:00, 18.63s/it]
Loss: nan: 100%|██████████| 

In [15]:
# unet.save_lora_adapter("saved_lora_adapters/", adapter_name="damage_lora")

In [None]:
# pipe.unet.load_attn_procs("saved_lora_adapters/")

In [21]:
# Now prompt new synthetic images

prompt = "A damaged bottle with a crack at the bottom"
negative_prompt = "blurry, low quality, distorted"

# Generate synthetic defect image
# pipe.safety_checker = None
pipe.safety_checker = None # lambda images, clip_input: (images, False)

image = pipe(
    prompt,
    # negative_prompt=negative_prompt,
    num_inference_steps=5,
    guidance_scale=1,
).images[0]

# Show the image
image.show()

# Save if needed
image.save("synthetic_damage_sample.png")

  0%|          | 0/5 [00:00<?, ?it/s]

# Trying to train the model on all images

In [24]:
model_id = "CompVis/stable-diffusion-v1-4"

pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    revision="fp16",
).to("cuda")

# Now, processor is:
processor = pipe

processor = {
    "image_processor": pipe.feature_extractor,  # For images
    "tokenizer": pipe.tokenizer,                # For captions
}

 The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title 'CompVis/stable-diffusion-v1-4 is missing fp16 files' so that the correct variant file can be added.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

An error occurred while trying to fetch C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\unet: Error no file named diffusion_pytorch_model.safetensors found in directory C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
An error occurred while trying to fetch C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\vae: Error no file named diffusion_pytorch_model.safetensors found in directory C:\Users\adity\.cache\huggingface\hub\models--CompVis--stable-diffusion-v1-4\snapshots\2880f2ca379f41b0226444936bb7a6766a227587\vae.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.


In [22]:
class DefectImageCaptionDataset(Dataset):
    def __init__(self, data_folder, processor, size=(512, 512)):
        self.data_folder = data_folder
        self.processor = processor
        self.size = size

        # List all image files
        self.image_files = [f for f in os.listdir(data_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        # Image filename
        image_filename = self.image_files[idx]
        image_path = os.path.join(self.data_folder, image_filename)

        # Corresponding caption filename
        caption_filename = os.path.splitext(image_filename)[0] + '.txt'
        caption_path = os.path.join(self.data_folder, caption_filename)

        # Load image
        image = Image.open(image_path).convert("RGB")
        image = image.resize(self.size)

        # Load caption
        with open(caption_path, 'r', encoding='utf-8') as f:
            caption = f.read().strip()

        # Processor
        inputs = self.processor(images=image, text=caption, return_tensors="pt")

        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "input_ids": inputs["input_ids"].squeeze(0),
        }


In [25]:
dataset = DefectImageCaptionDataset(
    data_folder="../dataset/bottle/image/",
    processor=processor
)

train_dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [26]:
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, unet.parameters()), lr=0.001)

num_epochs = 20

unet.train()
for epoch in range(num_epochs):
    pbar = tqdm(train_dataloader)
    for batch in pbar:
        optimizer.zero_grad()

        latents = pipe.vae.encode(batch["pixel_values"].to("cuda").half()).latent_dist.sample()
        latents = latents * 0.18215  # VAE scaling

        encoder_hidden_states = pipe.text_encoder(batch["input_ids"].to("cuda"))[0]

        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, 1000, (latents.shape[0],), device=latents.device).long()

        noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)
        noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

        loss = F.mse_loss(noise_pred, noise)
        loss.backward()

        optimizer.step()

        pbar.set_description(f"Loss: {loss.item():.4f}")

  0%|          | 0/32 [00:00<?, ?it/s]


TypeError: 'dict' object is not callable