<a href="https://colab.research.google.com/github/adityamhamunkar/imagefinetuning/blob/main/Untitled63.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# train_lora.py

import argparse
import logging
import os
from pathlib import Path

import torch
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import load_dataset
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version
from huggingface_hub import create_repo, upload_folder
from peft import LoraConfig, get_peft_model
from torchvision import transforms
from tqdm.auto import tqdm

# Will error if the minimal version of diffusers is not installed. Remove at your own peril.
check_min_version("0.29.0.dev0")

logger = get_logger(__name__)

def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a LoRA fine-tuning script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default="runwayml/stable-diffusion-v1-5",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="lora_ring_finetuned",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        required=True,
        help="A folder containing the training data. The images should be in this folder.",
    )
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the dataset will be resized to this resolution. "
            "The training script will not resize the images if set to 0."
        ),
    )
    parser.add_argument(
        "--center_crop",
        default=False,
        action="store_true",
        help=(
            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
            " cropped. The images will be resized to the resolution first before cropping."
        ),
    )
    parser.add_argument(
        "--random_flip",
        default=False,
        action="store_true",
        help="whether to randomly flip images horizontally",
    )
    parser.add_argument(
        "--train_batch_size", type=int, default=1, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-4,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--lr_scheduler_type",
        type=str,
        default="constant",
        help=(
            "The scheduler type to use. Choose between ['linear', 'cosine', 'cosine_with_restarts', 'polynomial',"
            " 'constant', 'constant_with_warmup']"
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=0, help="Number of steps for the warmup phase of the lr scheduler."
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=0,
        help=(
            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
        ),
    )
    parser.add_argument(
        "--rank",
        type=int,
        default=4,
        help=("The LoRA rank parameter. Adjust for different adapter sizes."),
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[Weights & Biases] or [MLflow] log directory. Will default to"
            " `args.output_dir/runs/current_datetime_local`."
        ),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="wandb",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`, '
            '`"wandb"` (Weights & Biases), `"mlflow"`, and `"comet_ml"`. '
            'Use `"all"` to report to all the integrations.'
        ),
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of computational speed.",
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10.0. and an Nvidia Ampere GPU.  Defaults to the value of accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

    args = parser.parse_args()
    return args

def main():
    args = parse_args()

    # Initialize accelerator
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_dir=args.logging_dir,
    )

    if accelerator.is_main_process:
        accelerator.init_trackers("lora_fine_tuning_ring", config=vars(args))

    # Load scheduler, tokenizer and models.
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")

    # Freeze vae and unet parameters and apply LoRA to unet
    vae.requires_grad_(False)
    unet.requires_grad_(False)

    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()

    # Create LoRA config
    lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank, # Typically set to r
        target_modules=["to_q", "to_k", "to_v", "to_out.0"], # Standard modules for Stable Diffusion UNet
        lora_dropout=0.0,
        bias="none",
    )

    # Apply LoRA to UNet
    unet = get_peft_model(unet, lora_config)
    unet.print_trainable_parameters()

    # Set up optimizer
    optimizer = torch.optim.AdamW(unet.parameters(), lr=args.learning_rate)

    # Load dataset
    # You'll need to create a dataset that loads your 10 ring images.
    # For simplicity, let's assume images are directly in `args.train_data_dir`.
    # A more robust solution might use a metadata file.
    dataset = load_dataset(
        "imagefolder",
        data_dir=args.train_data_dir,
        cache_dir="./cache", # Optional: specify a cache directory
    )

    # Preprocessing
    train_transforms = transforms.Compose(
        [
            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    def preprocess_train(examples):
        images = [image.convert("RGB") for image in examples["image"]]
        examples["pixel_values"] = [train_transforms(image) for image in images]
        del examples["image"]
        return examples

    with accelerator.main_process_first():
        dataset["train"].set_transform(preprocess_train)

    train_dataloader = torch.utils.data.DataLoader(
        dataset["train"],
        batch_size=args.train_batch_size,
        shuffle=True,
        num_workers=args.dataloader_num_workers,
    )

    # Learning rate scheduler
    lr_scheduler = get_scheduler(
        args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=len(train_dataloader) * args.num_train_epochs,
    )

    # Prepare everything with accelerator
    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        unet, optimizer, train_dataloader, lr_scheduler
    )

    # For mixed precision training with bitsandbytes
    if args.mixed_precision == "fp16":
        from bitsandbytes.optim import AdamW8bit
        optimizer = AdamW8bit(unet.parameters(), lr=args.learning_rate)
        # Re-prepare the optimizer with accelerator if using bitsandbytes, as it might change the optimizer type
        optimizer = accelerator.prepare(optimizer)


    # Training loop
    global_step = 0
    for epoch in range(args.num_train_epochs):
        unet.train()
        train_loss = 0.0
        for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{args.num_train_epochs}", disable=not accelerator.is_local_main_process)):
            with accelerator.accumulate(unet):
                # Convert images to latent space
                latents = vae.encode(batch["pixel_values"].to(unet.device)).latent_dist.sample() * vae.config.scaling_factor

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device).long()

                # Add noise to the latents according to the noise magnitude at each timestep
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning (not directly used in this basic example for image-only training, but often present in SD)
                # For unconditional training, you'd typically use empty text embeddings or no text encoder.
                # In a real scenario for ring style, you might have a generic prompt like "a ring"
                # This example assumes you're fine-tuning the UNet without specific text conditioning per image.
                # If you have captions, you would encode them here.
                encoder_hidden_states = torch.randn(bsz, 77, 768).to(unet.device) # Placeholder for unconditional generation

                # Predict the noise residual
                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
                    target = noise
                elif noise_scheduler.config.prediction_type == "v_prediction":
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

                loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean")

                # Gather the losses across all processes for logging (if using distributed training)
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
                train_loss += avg_loss.item() / args.gradient_accumulation_steps

                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            if accelerator.is_main_process:
                if step % 10 == 0: # Log every 10 steps (adjust as needed)
                    logs = {"loss": train_loss / (step + 1), "lr": lr_scheduler.get_last_lr()[0]}
                    accelerator.log(logs, step=global_step)
            global_step += 1

        accelerator.wait_for_everyone()

        # Save the adapter at the end of each epoch or after a certain number of steps
        if accelerator.is_main_process:
            # Save the LoRA adapter weights
            unet = accelerator.unwrap_model(unet)
            unet.save_pretrained(os.path.join(args.output_dir, f"epoch_{epoch+1}")) # Saves as diffusers format
            # If you want to save just the lora_ring.bin file (PEFT format for just the adapter)
            torch.save(unet.peft_config, os.path.join(args.output_dir, "lora_ring_config.json"))
            torch.save(unet.state_dict(), os.path.join(args.output_dir, "lora_ring.bin"))

            # Log to Weights & Biases or MLflow
            current_logs = {"epoch_loss": train_loss / len(train_dataloader)}
            accelerator.log(current_logs, step=global_step)

    accelerator.end_training()

    # Final save of the adapter
    if accelerator.is_main_process:
        logger.info(f"Saving final adapter to {args.output_dir}")
        unet = accelerator.unwrap_model(unet)
        # You can save in a diffusers compatible format which includes the config and weights
        unet.save_pretrained(args.output_dir)
        # Or specifically just the adapter weights in PEFT format
        torch.save(unet.state_dict(), os.path.join(args.output_dir, "lora_ring.bin"))

        # Optional: Upload to Hugging Face Hub
        # if args.push_to_hub:
        #     repo_id = create_repo(repo_id=args.output_dir, exist_ok=True).repo_id
        #     upload_folder(
        #         repo_id=repo_id,
        #         folder_path=args.output_dir,
        #         commit_message="End of training",
        #         ignore_patterns=["*.pt", "*.bin", "*.json"], # Adjust if you want to upload .bin too
        #     )

if __name__ == "__main__":
    main()

usage: colab_kernel_launcher.py [-h]
                                [--pretrained_model_name_or_path PRETRAINED_MODEL_NAME_OR_PATH]
                                [--output_dir OUTPUT_DIR] --train_data_dir
                                TRAIN_DATA_DIR [--resolution RESOLUTION]
                                [--center_crop] [--random_flip]
                                [--train_batch_size TRAIN_BATCH_SIZE]
                                [--num_train_epochs NUM_TRAIN_EPOCHS]
                                [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
                                [--learning_rate LEARNING_RATE]
                                [--lr_scheduler_type LR_SCHEDULER_TYPE]
                                [--lr_warmup_steps LR_WARMUP_STEPS]
                                [--seed SEED]
                                [--dataloader_num_workers DATALOADER_NUM_WORKERS]
                                [--rank RANK] [--logging_dir LOGGING_DIR]
                 

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [12]:
!accelerate launch /content/sample_data/train_lora.py \
    --pretrained_model_name_or_path "runwayml/stable-diffusion-v1-5" \
    --train_data_dir "/content/sample_data/ring_images" \
    --output_dir "./lora_ring_output" \
    --resolution 512 \
    --train_batch_size 1 \
    --num_train_epochs 3 \
    --rank 4 \
    --learning_rate 1e-4 \
    --report_to "wandb"

ipex flag is deprecated, will be removed in Accelerate v1.10. From 2.7.0, PyTorch has all needed optimizations for Intel CPU and XPU.
The following values were not passed to `accelerate launch` and had defaults used instead:
	`--num_processes` was set to a value of `0`
	`--num_machines` was set to a value of `1`
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`
2025-07-26 21:46:05.826842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753566365.854279    6438 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753566365.862093    6438 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Curr

In [13]:
repo_name = "Pumpkinpie25/vit-finetuned-lora- chain101"
lora_model.push_to_hub(repo_name)

NameError: name 'lora_model' is not defined

In [20]:
# This would be in a *new* Colab cell, after your training has completed

from huggingface_hub import create_repo, upload_folder, login
import os
from accelerate import Accelerator # Import Accelerator again

# Ensure you're logged in (run this if you haven't already in the session)
# login()

# Re-initialize accelerator for checking main process (optional, but good practice)
# You might not need this if you're certain you're running on a single process.
# If you don't re-initialize, remove the `if accelerator.is_main_process:` check.
accelerator = Accelerator() # Initialize it here!

# Define your output directory (must match what you used in training)
output_dir = "./lora_ring_output" # Make sure this path is correct

if accelerator.is_main_process: # Still good to check if running accelerate launch
    repo_id = "Pumpkinpie25/finetuned-lora-chain101" # <--- IMPORTANT: Replace with YOUR Hugging Face username
    print(f"Creating Hugging Face Hub repository: {repo_id}")
    create_repo(repo_id=repo_id, private=False, exist_ok=True)

    print(f"Uploading adapter from {output_dir} to {repo_id}")
    upload_folder(
        repo_id=repo_id,
        folder_path=output_dir,
        commit_message="LoRA adapter fine-tuned for ring style",
    )
    print(f"LoRA adapter pushed to https://huggingface.co/{repo_id}")
else:
    print("Not main process, skipping Hugging Face Hub upload.")

Creating Hugging Face Hub repository: Pumpkinpie25/finetuned-lora-chain101
Uploading adapter from ./lora_ring_output to Pumpkinpie25/finetuned-lora-chain101


Uploading...:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

LoRA adapter pushed to https://huggingface.co/Pumpkinpie25/finetuned-lora-chain101


In [19]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!python /content/sample_data/test_lora.py

2025-07-26 22:48:54.021036: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753570134.486935   21374 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753570134.615498   21374 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading base model: runwayml/stable-diffusion-v1-5
model_index.json: 100% 541/541 [00:00<00:00, 1.42MB/s]
Fetching 15 files:   0% 0/15 [00:00<?, ?it/s]
model.safetensors:   0% 0.00/1.22G [00:00<?, ?B/s][A

model.safetensors:   0% 0.00/492M [00:00<?, ?B/s][A[A


preprocessor_config.json: 100% 342/342 [00:00<00:00, 1.53MB/s]
Fetching 15 files:   7% 1/15 [00:00<00:03,  3.93it/s]


config.json: 4.72kB [00:00, 456kB/s]



special_token