In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src.helpers`) are picked up live, without having
# to restart the kernel.
%load_ext autoreload
%autoreload 2

# Basic Usage

- https://huggingface.co/docs/diffusers/main/en/using-diffusers/sdxl
- https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl
 
Pipeline Source Code: https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/stable_diffusion_xl

In [2]:
import torch
import diffusers

from src import helpers

PyTorch version: 2.0.1+cu117


## Base Model

- https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0

In [3]:
# Hugging Face Hub ids of the SDXL base checkpoint and its refiner.
# `refiner_id` is consumed later, in the "Refiner Model" section.
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
refiner_id = "stabilityai/stable-diffusion-xl-refiner-1.0"

# Load the fp16 safetensors variant of the base text-to-image pipeline.
pipe = diffusers.StableDiffusionXLPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
    # low_cpu_mem_usage=True,
)

# Keep the sub-models offloaded to the CPU and only bring them onto the
# accelerator while they are executing (trades speed for a much lower peak
# GPU memory footprint).
pipe.enable_sequential_cpu_offload()

# Decode the latents one image at a time instead of as a single batch.
# Generating several images per prompt otherwise makes the VAE decode the
# whole batch at once; a batch of 6 images previously failed with
# "CUDA out of memory. Tried to allocate 6.00 GiB" on a ~16 GiB GPU.
pipe.enable_vae_slicing()

print(pipe)
print("Device:", pipe.device)

# When using torch >= 2.0, you can improve the inference speed by 20-30% with torch.compile.
# Simply wrap the unet with torch.compile before running the pipeline:
# if torch.__version__ >= "2.0":
#     print("Using torch.compile. The first run will be slow, but subsequent runs will be faster.")
#     pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

StableDiffusionXLPipeline {
  "_class_name": "StableDiffusionXLPipeline",
  "_diffusers_version": "0.20.2",
  "_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
  "force_zeros_for_empty_prompt": true,
  "scheduler": [
    "diffusers",
    "EulerDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "text_encoder_2": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "tokenizer_2": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

Device: cpu


In [4]:
# == The prompt or prompts to guide the image generation.
#    If not defined, one has to pass `prompt_embeds` instead.
prompt = "flock of sheep are having selfie with a grazing on grassland, himalayan background extra detailed, highly realistic, extra detailed, himalayn landscape, hyper realistic"

# == The prompt or prompts not to guide the image generation.
#    If not defined, one has to pass `negative_prompt_embeds` instead.
#    Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
negative_prompt = "low-res, low quality, jpeg artifacts, blurry, grainy, distorted, ugly, out of frame, watermarked"

# == Other parameters
# == Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
#    `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf).
#    Guidance scale is enabled by setting `guidance_scale > 1`.
#    Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
guidance_scale = 7.5 # Default: 7.5

# == The number of denoising steps.
#    More denoising steps usually lead to a higher quality image at the expense of slower inference.
num_inference_steps = 30  # Default: 50

# == The number of images to generate per prompt.
num_images_per_prompt = 6  # Default: 1

# ================================================================================================
# == Seed for the random generator. Leave as `None` for a different result on
#    every run, or set it to an integer to make the run repeatable.
seed = None

# == For deterministic results across runs, we create a torch.Generator using the seed value.
generator = None
if seed is not None:
    # `seed` is a plain integer (or None), so it is used directly; it has no
    # `.value` attribute to read from.
    print(f"Using seed: {seed}")
    generator = torch.Generator(device=pipe.device).manual_seed(seed)

# == Run the text-to-image pipeline with the settings above.
out = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=guidance_scale,
    num_inference_steps=num_inference_steps,
    num_images_per_prompt=num_images_per_prompt,
    generator=generator,
)
# `helpers.plot` is project-local (`src/helpers`); presumably it displays the
# generated images - confirm against that module.
helpers.plot(out.images)

  0%|          | 0/30 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.00 GiB (GPU 0; 15.69 GiB total capacity; 10.82 GiB already allocated; 286.75 MiB free; 12.42 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Refiner Model

In [None]:
# Load the fp16 safetensors variant of the SDXL refiner as an image-to-image
# pipeline and place it on the GPU when one is available, otherwise on the CPU.
refiner = diffusers.StableDiffusionXLImg2ImgPipeline.from_pretrained(
    refiner_id,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
    # low_cpu_mem_usage=True,
).to("cuda" if torch.cuda.is_available() else "cpu")

# Report the refiner pipeline that was just loaded (not the base `pipe`).
print(refiner)
print("Device:", refiner.device)