## A notebook that demonstrates the diffusion image generation process step by step, outputting images and a GIF showing the process
### TLDR Process --> this one's easy:
1. Get onto Google Colab with a GPU runtime (L4 will do)
2. Install the diffusers library (everything else should be available)
3. Set up a HF_TOKEN env variable to auth to HuggingFace
4. Run all the cells to have your image generated using the default encoders, tokenizers, unet and scheduler
5. Try out some other autoencoders, unets and schedulers from [ðŸ¤— Hugging Face](https://huggingface.co/models)

In [None]:
!pip install diffusers

In [46]:
from diffusers import AutoencoderKL, PNDMScheduler, UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer
import torch
from torch import autocast
from tqdm.auto import tqdm
from PIL import Image
import os

In [47]:
# Auth to Hugging Face manually, otherwise just set a HF_TOKEN env variable to avoid this
# from huggingface_hub import login
# login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [48]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [49]:
# variational autoencoder model used to decode the latents back into image space
vae = AutoencoderKL.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="vae").to(device)

# tokenizer and text encoder to tokenize and encode prompts
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

# UNet model uesd to transform images into latent space
unet = UNet2DConditionModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet").to(device)

# scheduler for handling noise manipulation
scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)

In [55]:
# define prompt and basic parameters
prompt = ["An elephant drinking from a lake"]
height, width = 1024, 1024
num_inference_steps = 30
guidance_scale = 7.5
batch_size = 1

In [56]:
def get_text_embeddings(prompt, max_length=None):
  max_length = max_length if max_length is not None else tokenizer.model_max_length
  text_input = tokenizer(
      prompt,
      padding="max_length",
      max_length=max_length,
      truncation=True,
      return_tensors="pt",
  )
  
  with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids)[0]
  
  max_length = text_input.input_ids.shape[-1]
  return text_embeddings, max_length

prompt_embeddings, max_length = get_text_embeddings(prompt)

# unconditional input used in classifier-free guidance method
uncond_embeddings, _ = get_text_embeddings([""], max_length)

text_embeddings = torch.cat([uncond_embeddings, prompt_embeddings]).to(device)

In [57]:
# prepare scheduler
def set_timesteps(scheduler, num_inference_steps):
    scheduler.set_timesteps(num_inference_steps)
    scheduler.timesteps = scheduler.timesteps.to(torch.float32)

set_timesteps(scheduler,num_inference_steps)

# define latents in the desired dimensionality and shape
latents = torch.randn(
  (batch_size, unet.in_channels, height // 8, width // 8),
)
latents = latents.to(device)
latents = latents * scheduler.init_noise_sigma

  (batch_size, unet.in_channels, height // 8, width // 8),


In [58]:
# handle transformations between the standard image space and the latent space
def latent_to_image_tensor(latents):
  # 0.18215 is a scale factor outlined in the original "High-Resolution Image Synthesis with Latent Diffusion Models" paper (https://arxiv.org/pdf/2112.10752)
  latents = 1/0.18215 * latents
  with torch.no_grad():
      image = vae.decode(latents).sample
  return image

def tensor_to_image(image):
  image = (image / 2 + 0.5).clamp(0, 1)  # take values from [-1,1] to [0,1]
  image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
  image = (image * 255).round().astype("uint8")
  pil_image = Image.fromarray(image[0])  # index 0 to remove batch dimension
  return pil_image

In [59]:
from tqdm import tqdm
import torch

for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
    # duplicate the latents tensor as required for classifier-free guidance
    latent_input = torch.cat([latents] * 2)

    # only performing inference therefore no gradient computation required
    with torch.no_grad():
        noise_pred = unet(latent_input, t, encoder_hidden_states=text_embeddings).sample

    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)

    # the core algorithm: good to think about it like a simple equation: n=u+g*(t-u)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # convert timestemp to a long as required by the PNDM scheduler
    t_int = t.long() if torch.is_tensor(t) else int(t)

    # use the scheduler to update the latents based on the predicted noise
    latents = scheduler.step(noise_pred, t_int, latents).prev_sample

    # Convert the latents to an image tensor and then to an image that can be saved
    image_tensor = latent_to_image_tensor(latents)
    image = tensor_to_image(image_tensor)
    image.save(f"image_{i+1:02d}.png")

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 39/39 [00:17<00:00,  2.23it/s]


In [54]:
# import all images and make an animation
images = [f for f in os.listdir('.') if f.startswith('image_')]
images.sort()
image_list = []

for image in images:
  path = os.path.join('.', image)
  image = Image.open(path)
  image_list.append(image)

image_list[0].save("elephant_animation.gif", save_all=True, append_images=image_list[1:], duration=100, loop=0)