## Tutorial: Running minimal inference examples with diffusers

For this tutorial, we will use my pre-trained LoRA embeddings, which were trained on pop art, illustrations, and Pixar footage.

To get started install this package with:

```bash
pip install git+https://github.com/cloneofsimo/lora.git
```


In [1]:
from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler
import torch
import os
# os.environ["DISABLE_TELEMETRY"] = 'YES'
# Hub id of the base Stable Diffusion checkpoint used to build the pipelines in this notebook.
model_id = "runwayml/stable-diffusion-v1-5"

# Load the fp16 pipeline with local_files_only=True, pinned to a fixed revision hash.
# NOTE(review): the device index "cuda:4" is hard-coded for the author's machine — adjust as needed.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16,local_files_only=True,revision='39593d5650112b4cc580433f6b0435385882d819').to(
    "cuda:4"
)
# Replace the pipeline's scheduler with Euler-ancestral, built from the existing scheduler config.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

prompt = "style of <s1><s2>, baby lion"
# Seed the global torch RNG; the returned Generator object is what this cell displays.
torch.manual_seed(0)
# Baseline generation with the unpatched pipeline (currently disabled):
# image = pipe(prompt, num_inference_steps=50, guidance_scale=7).images[0]

# image  # nice. diffusers are cool.

  from .autonotebook import tqdm as notebook_tqdm
2023-05-27 08:35:30.987606: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


<torch._C.Generator at 0x7f9252d476d0>

: 

In [None]:
! load_ext autoreload
! autoreload 2
import json
import math
from itertools import groupby
from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union

import numpy as np
import PIL
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
sys.path.append('../')
from lora_diffusion import LoraInjectedConv2d, LoraInjectedLinear, patch_pipe, tune_lora_scale
from lora_diffusion.lora import _find_modules, UNET_CROSSATTN_TARGET_REPLACE
from visual import visualize_images


In [None]:
# Quick check of the regularisation-loss API on a standalone LoRA-injected linear layer.
from lora_diffusion import LoraInjectedLinear
layer = LoraInjectedLinear(10,20)
# NOTE(review): presumably the layer is (in_features=10, out_features=20), so reg_vector
# carries one weight per input feature — confirm against lora_diffusion.
layer.get_reg_loss(reg_vector=torch.ones(10))

In [None]:
# Encode token ids with a CLIP text encoder whose causal attention mask is edited around an identifier token.
def get_masked_identifier_latents(text_encoder,token_ids,identifier_indice,class_len,dtype=torch.float16,replace_identifier = True):
    """Run the text encoder with extra masking around one identifier position.

    The causal attention mask is built for the actual sequence length of
    ``token_ids`` and then edited in place before being passed to the encoder:

    * in mask row ``identifier_indice``, columns ``[0, max(identifier_indice, 1))``
      are set to ``torch.finfo(dtype).min``;
    * in mask column ``identifier_indice``, every row from
      ``identifier_indice + class_len + 1`` onwards is set to the same value.

    NOTE(review): presumably rows are query positions and columns are key
    positions, so the identifier cannot attend to earlier tokens and tokens
    after the class tokens cannot attend to the identifier — confirm against
    the mask layout of the installed ``transformers`` version.

    Args:
        text_encoder: object exposing ``device`` and ``text_model`` (with
            ``embeddings``, ``_build_causal_attention_mask``, ``encoder`` and
            ``final_layer_norm``).
        token_ids: 2-D tensor of token ids, ``(batch, seq_len)``.
        identifier_indice: position of the identifier token in the sequence.
        class_len: number of class tokens that follow the identifier.
        dtype: dtype used to build the attention mask.
        replace_identifier: currently unused; kept for interface compatibility.

    Returns:
        The encoder's first output after ``final_layer_norm``.
    """
    device = text_encoder.device
    text_model = text_encoder.text_model
    # Derive both sizes from the input instead of assuming the 77-token CLIP window.
    batch_size, seq_len = token_ids.size(0), token_ids.size(1)

    hidden_states = text_model.embeddings(input_ids=token_ids.to(device))

    causal_attention_mask = text_model._build_causal_attention_mask(batch_size, seq_len, dtype)
    masked_value = torch.finfo(dtype).min
    # max(..., 1) keeps at least column 0 masked even when the identifier sits at position 0.
    causal_attention_mask[:, :, identifier_indice, :max(identifier_indice, 1)] = masked_value
    causal_attention_mask[:, :, identifier_indice + class_len + 1:, identifier_indice] = masked_value

    encoder_outputs = text_model.encoder(
        inputs_embeds=hidden_states,
        causal_attention_mask=causal_attention_mask.to(device),
    )
    last_hidden_state = encoder_outputs[0]
    return text_model.final_layer_norm(last_hidden_state)






In [None]:
# Compare images generated from the regular prompt embedding with images generated
# from the embedding produced by get_masked_identifier_latents.
bs  = 2
prompt = ['a dog in the beach'] * bs
# Pad every prompt to 77 tokens; truncation is disabled.
tokens = pipe.tokenizer(prompt,return_tensors='pt',padding='max_length',truncation=False,max_length=77)
# Regular text-encoder output (first element of the encoder result).
embed = pipe.text_encoder(tokens['input_ids'].to(pipe.text_encoder.device))[0]
no_mask_images = pipe(prompt_embeds= embed, num_inference_steps=50, guidance_scale=7).images
# Same token ids, re-encoded with identifier index 1 and class length 2.
embed = get_masked_identifier_latents(pipe.text_encoder,tokens['input_ids'],1,2)
mask_images = pipe(prompt_embeds= embed, num_inference_steps=50, guidance_scale=7).images
# Show both batches together (unmasked images first, then masked).
visualize_images(no_mask_images+mask_images,prompt=prompt[0],outpath='figure/',nrow=2,type = 'mask')

In [None]:
from safetensors import safe_open
import copy
# Sample from the checkpoint trained with filter_crossattn_str='cross'.
lora_ckpts = ['../output_dog_crossOnly_tR0.01/lora_weight_e12_s2000.safetensors']
prompts = ['photo of a dog','photo of a <krk> dog', 'a <krk> dog at a beach with a view of the seashore']
bs = 8
# Patch a deep copy so the base `pipe` object itself is not modified by this cell.
pipe_copy = copy.deepcopy(pipe)
torch.manual_seed(0)
# Per the patch_* flags: UNet and textual-inversion patching on, text-encoder patching off.
patch_pipe(
    pipe_copy,
    lora_ckpts[0],
    patch_text=False,
    patch_ti=True,
    patch_unet=True,
    filter_crossattn_str = 'cross'
)
# pipe.unet
# NOTE(review): a scale of 0 presumably switches the UNet LoRA contribution off — confirm this is intended.
tune_lora_scale(pipe_copy.unet, 0)
# tune_lora_scale(pipe_copy.text_encoder, 1)
# Generate `bs` images per prompt and show them without saving (save=False).
for prompt in prompts:
    prompt = [prompt]*bs
    img = pipe_copy(prompt = prompt, num_inference_steps=50, guidance_scale=6).images
    visualize_images(img,outpath='figure/', nrow=4, save=False)

In [None]:
from safetensors import safe_open
import copy
# Sample from the checkpoint trained with filter_crossattn_str='cross+self'.
# NOTE(review): `_lora_path` duplicates `lora_ckpts` and is not read anywhere in this cell.
_lora_path = ['../output_dog_cross+self/lora_weight_e12_s2000.safetensors']
lora_ckpts = ['../output_dog_cross+self/lora_weight_e12_s2000.safetensors']
prompts = ['photo of a dog','photo of a <krk> dog', 'a <krk> dog at a beach with a view of the seashore']
bs = 8
# Drop the previous patched copy before creating a new one; this requires that an
# earlier cell already defined `pipe_copy`, otherwise `del` raises NameError.
del pipe_copy
pipe_copy = copy.deepcopy(
    pipe)
torch.manual_seed(0)
# Per the patch_* flags: UNet and textual-inversion patching on, text-encoder patching off.
patch_pipe(
    pipe_copy,
    lora_ckpts[0],
    patch_text=False,
    patch_ti=True,
    patch_unet=True,
    filter_crossattn_str = 'cross+self'
)
# pipe.unet
tune_lora_scale(pipe_copy.unet, 1)
# tune_lora_scale(pipe_copy.text_encoder, 1)
# Generate `bs` images per prompt and show them without saving (save=False).
for prompt in prompts:
    prompt = [prompt]*bs
    img = pipe_copy(prompt = prompt, num_inference_steps=50, guidance_scale=6).images
    visualize_images(img,outpath='figure/', nrow=4, save=False)

In [None]:
from safetensors import safe_open
import copy
# Sample from the 'cross+self' checkpoint stored under the tR0.001_nR0.001 output directory.
lora_ckpts = ['../output_dog_cross+self_tR0.001_nR0.001/lora_weight_e12_s2000.safetensors']
prompts = ['photo of a dog','photo of a <krk> dog', 'a <krk> dog at a beach with a view of the seashore']
bs = 8
# Patch a deep copy so the base `pipe` object itself is not modified by this cell.
pipe_copy = copy.deepcopy(pipe)
torch.manual_seed(0)
# Per the patch_* flags: UNet and textual-inversion patching on, text-encoder patching off.
patch_pipe(
    pipe_copy,
    lora_ckpts[0],
    patch_text=False,
    patch_ti=True,
    patch_unet=True,
    filter_crossattn_str = 'cross+self'
)
# pipe.unet
tune_lora_scale(pipe_copy.unet, 1)
# tune_lora_scale(pipe_copy.text_encoder, 1)
# Generate `bs` images per prompt and show them without saving (save=False).
for prompt in prompts:
    prompt = [prompt]*bs
    img = pipe_copy(prompt = prompt, num_inference_steps=50, guidance_scale=6).images
    visualize_images(img,outpath='figure/', nrow=4, save=False)

In [None]:
# Hand-written classifier-free-guidance sampling loop, run once per LoRA checkpoint
# and prompt, bypassing the pipeline's own __call__.
lora_ckpts = ['../output_dog/lora_weight_e12_s2000.safetensors']
prompts = ['photo of a dog']
bs = 4
torch.manual_seed(0)
for lora_ckpt in lora_ckpts:
    # Note: this patches the shared `pipe` in place (no deep copy is taken here).
    patch_pipe(
        pipe,
        lora_ckpt,
        patch_text=True,
        patch_ti=True,
        patch_unet=True,
    )
    tune_lora_scale(pipe.unet, 0.1)
    tune_lora_scale(pipe.text_encoder, 1)
    guidance_scale = 7

    with torch.no_grad():
        for prompt in prompts:
            prompt = [prompt] * bs
            tokens = pipe.tokenizer(prompt,return_tensors='pt',padding='max_length',truncation=False,max_length=77)
            un_tokens = pipe.tokenizer([''] * bs,return_tensors='pt',padding='max_length',truncation=False,max_length=77)
            embed = pipe.text_encoder(tokens['input_ids'].to(pipe.text_encoder.device))[0]
            un_embed = pipe.text_encoder(un_tokens['input_ids'].to(pipe.text_encoder.device))[0]
            # Unconditional embeddings first, conditional second: the order must
            # match the chunk(2) split of the UNet output below.
            c_embed = torch.concat([un_embed,embed],dim=0)
            # embed = get_masked_identifier_latents(pipe.text_encoder,tokens['input_ids'],5,1)
            pipe.scheduler.set_timesteps(50)

            sample_size = pipe.unet.config.sample_size
            # Move the noise to the device the UNet actually lives on instead of a
            # hard-coded "cuda", which can be a different GPU than the pipeline's.
            noise = torch.randn((bs , 4, sample_size, sample_size),dtype=torch.float16).to(pipe.unet.device)
            # Schedulers expect the starting latents scaled by init_noise_sigma
            # (1.0 for some schedulers, the largest sigma for Euler-type ones).
            latents = noise * pipe.scheduler.init_noise_sigma
            for t in pipe.scheduler.timesteps:
                # Duplicate the latents so one UNet call covers both guidance branches.
                latent_model_input = torch.cat([latents] * 2)
                latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)
                noise_pred = pipe.unet(latent_model_input, t,encoder_hidden_states=c_embed).sample
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
                latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample
            image = pipe.decode_latents(latents)
            image = pipe.numpy_to_pil(image)
            # A bare `image` expression inside nested blocks is never displayed by
            # the notebook, so show the batch explicitly like the other cells do.
            visualize_images(image, outpath='figure/', nrow=bs, save=False)

In [None]:
# Patch `pipe` with each checkpoint and visualise samples produced by visual_unet_scales.
# NOTE(review): `visual_unet_scales` is not imported in any cell shown here (only
# `visualize_images` is imported from `visual`) — confirm where it is defined.
# `os` comes from the import in the first code cell.
lora_ckpts = ['../output_dog/lora_weight_e12_s2000.safetensors']
prompts = ['photo of a <krk> dog','photo of a <krk> dog swimming in a pool']
bs = 2
for lora_ckpt in lora_ckpts: 
    # Note: this patches the shared `pipe` in place (no deep copy is taken here).
    patch_pipe(
        pipe,
        lora_ckpt,
        patch_text=True,
        patch_ti=True,
        patch_unet=True,
    )
    for prompt in prompts:
        scale_type = 'all'
        images = visual_unet_scales(pipe,prompt,type= scale_type,seed=1,batch_size=bs,scales=[1])
        # First two path components of the checkpoint plus 'grid_samples',
        # i.e. '../output_dog/grid_samples' for the checkpoint above.
        outpath = os.path.join(*lora_ckpt.split('/')[:2],'grid_samples')
        visualize_images(images,prompt,outpath,nrow=bs, save = False, type = scale_type)

In [None]:
import copy
# Patch a deep copy of the pipeline with the checkpoint from the 'selfOnly' output directory.
# `prompts` and `bs` are defined here but not used later in this cell.
lora_ckpts = ['../output_dog_selfOnly/lora_weight_e12_s2000.safetensors']
prompts = ['photo of a dog','photo of a <krk> dog', 'a <krk> dog at a beach with a view of the seashore']
bs = 8
pipe_copy = copy.deepcopy(pipe)
torch.manual_seed(0)
# Per the patch_* flags: UNet and textual-inversion patching on, text-encoder patching off.
patch_pipe(
    pipe_copy,
    lora_ckpts[0],
    patch_text=False,
    patch_ti=True,
    patch_unet=True,
    filter_crossattn_str = 'self'
)
def filter_unet_lora_weights(unet, target_replace_module=UNET_CROSSATTN_TARGET_REPLACE):
    """
    Split the LoRA-injected modules found in ``unet`` into two groups.

    Both groups are collected with ``_find_modules`` (the third element of each
    yielded tuple is taken as the module):

    * "cross_project_loras": ``LoraInjectedLinear`` modules returned when
      searching with ``filter_crossattn_str='cross'``;
    * "other_loras": every ``LoraInjectedLinear`` / ``LoraInjectedConv2d``
      module found without that filter and not in the first group, de-duplicated
      and kept in discovery order so the result is the same on every run.

    returns :
        {"cross_project_loras": list of modules,
        "other_loras": list of modules,
        }
    """
    all_lora_modules = [
        module
        for _, _, module in _find_modules(
            unet, target_replace_module, search_class=[LoraInjectedLinear, LoraInjectedConv2d]
        )
    ]
    cross_project_modules = [
        module
        for _, _, module in _find_modules(
            unet, target_replace_module, search_class=[LoraInjectedLinear], filter_crossattn_str='cross'
        )
    ]

    # Same membership as a set difference, but built in iteration order so the
    # list does not depend on the arbitrary ordering of a set.
    excluded = set(cross_project_modules)
    seen = set()
    other_lora_modules = []
    for module in all_lora_modules:
        if module in excluded or module in seen:
            continue
        seen.add(module)
        other_lora_modules.append(module)

    return {
        "cross_project_loras": cross_project_modules,
        "other_loras": other_lora_modules,
    }
filter_unet_lora_weights(pipe_copy.unet)

Nice. Let's try another example:


In [None]:
# Build a new fp16 pipeline on "cuda" and rebind `pipe` to it; the previously
# patched pipeline object is no longer referenced by this name.
# No scheduler is assigned here, so the checkpoint's own scheduler is used.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(
    "cuda"
)

prompt = "superman, style of <s1><s2>"
torch.manual_seed(1)
# Reference image generated before any LoRA is patched into this pipeline.
image = pipe(prompt, num_inference_steps=50, guidance_scale=4).images[0]

image


In [None]:
# Patch the pop-art LoRA into `pipe` (text encoder, textual-inversion tokens and UNet,
# per the patch_* flags) and regenerate with the same seed and prompt as the previous cell.
patch_pipe(
    pipe,
    "../example_loras/lora_popart.safetensors",
    patch_text=True,
    patch_ti=True,
    patch_unet=True,
)
torch.manual_seed(1)
# LoRA scale 1.0 for both the UNet and the text encoder.
tune_lora_scale(pipe.unet, 1.0)
tune_lora_scale(pipe.text_encoder, 1.0)
image = pipe(prompt, num_inference_steps=50, guidance_scale=4).images[0]
image


That is a good pop-art style, but we might get a better result with a lower $\alpha$ for both the text encoder and the UNet.


In [None]:
# Same seed and prompt as before, with both LoRA scales lowered to 0.5.
torch.manual_seed(1)
tune_lora_scale(pipe.unet, 0.5)
tune_lora_scale(pipe.text_encoder, 0.5)

image = pipe(prompt, num_inference_steps=50, guidance_scale=4.0).images[0]
# Write the result to the repository's contents folder.
image.save("../contents/pop_art.jpg")
image


# Appendix: Generating the images used in the README


In [None]:
# Generate and save the Disney-LoRA example image.
prompt = "baby lion in style of <s1><s2>"

# Patches the shared `pipe` in place with the Disney LoRA file.
patch_pipe(
    pipe,
    "../example_loras/lora_disney.safetensors",
    patch_text=True,
    patch_ti=True,
    patch_unet=True,
)
torch.manual_seed(6)
tune_lora_scale(pipe.unet, 0.5)
tune_lora_scale(pipe.text_encoder, 0.5)
image = pipe(prompt, num_inference_steps=50, guidance_scale=5).images[0]
image.save("../contents/disney_lora.jpg")
image


#


In [None]:

# Patch the krk LoRA into the shared `pipe`, then render one image per example prompt.
patch_pipe(
    pipe,
    "../example_loras/lora_krk.safetensors",
    patch_text=True,
    patch_ti=True,
    patch_unet=True,
)

# Prompt templates; "<TOK>" is replaced by "<s1><s2>" in the loop below.
example_prompts = [
    "painting of <TOK>, a starry night, style of vincent van gogh",
    "portrait of <TOK> by mario testino 1950, 1950s style, hair tied in a bun, taken in 1950, detailed face of <TOK>, sony a7r",
    "photof of <TOK>, 50mm, sharp, muscular, detailed realistic face, hyper realistic, perfect face, intricate, natural light, <TOK> underwater photoshoot,collarbones, skin indentation, Alphonse Mucha, Greg Rutkowski",
    "a photo of <TOK> in advanced organic armor, biological filigree, detailed symmetric face, flowing hair, neon details, intricate, elegant, highly detailed, digital painting, artstation, concept art, smooth, sharp focus, octane, art by Krenz Cushart , Artem Demura, Alphonse Mucha, digital cgi art 8K HDR by Yuanyuan Wang photorealistic",
    "a photo of <TOK> on the beach, small waves, detailed symmetric face, beautiful composition",
    "a photo of <TOK> rainbow background, wlop, dan mumford, artgerm, liam brazier, peter mohrbacher, jia zhangke, 8 k, raw, featured in artstation, octane render, cinematic, elegant, intricate, 8 k",
    "photo of Summoner <TOK> with a cute water elemental, fantasy illustration, detailed face, intricate, elegant, highly detailed, digital painting, artstation, concept art, wallpaper, smooth, sharp focus, illustration, art by artgerm and greg rutkowski",
    "<TOK>, cyberpunk 2077, 4K, 3d render in unreal engine",
    "a pencil sketch of <TOK>",
    "a minecraft render of <TOK>",
    "young woman <TOK>, eden, intense eyes, tears running down, crying, vaporwave aesthetic, synthwave, colorful, psychedelic, crown, long gown, flowers, bees, butterflies, ribbons, ornate, intricate, digital painting, artstation, concept art, smooth, sharp focus, illustration of <wday>, art by artgerm and greg rutkowski and alphonse mucha",
    "<TOK> in a construction outfit",
]

# Generated images are collected in `outs`, which a later cell tiles into a grid.
outs = []
tune_lora_scale(pipe.unet, 0.5)
tune_lora_scale(pipe.text_encoder, 0.5)
for idx, prompt in enumerate(example_prompts):
    prompt = prompt.replace("<TOK>", "<s1><s2>")
    # Seed with the prompt index so each image can be regenerated independently.
    torch.manual_seed(idx)
    image = pipe(prompt, num_inference_steps=50, guidance_scale=6).images[0]
    outs.append(image)


In [None]:
from lora_diffusion import image_grid

# Tile the images collected in `outs` into a single grid image and save it.
# NOTE(review): the arguments 3 and 4 are presumably rows and columns (12 cells for
# the 12 example prompts) — confirm against lora_diffusion.image_grid.
imgs = image_grid(outs, 3, 4)
imgs.save("../contents/lora_pti_example.jpg")
imgs

## Using extended LoRA


In [None]:

from lora_diffusion import UNET_EXTENDED_TARGET_REPLACE

# Patch the "modern disney" LoRA into `pipe`, passing UNET_EXTENDED_TARGET_REPLACE
# as the set of UNet target modules instead of relying on patch_pipe's default.
patch_pipe(
    pipe,
    "../example_loras/modern_disney_svd.safetensors",
    patch_text=True,
    patch_ti=True,
    patch_unet=True,
    unet_target_replace_module=UNET_EXTENDED_TARGET_REPLACE
)

In [None]:
# Generate one 512x640 (width x height) image with the extended LoRA.
prompt = "modern disney style, cute baby lion"

# LoRA scale applied to both the UNet and the text encoder.
SC = 2.5

torch.manual_seed(0)
tune_lora_scale(pipe.unet, SC)
tune_lora_scale(pipe.text_encoder, SC)

# The result is stored in `img_ori`; this cell does not display it.
img_ori = pipe(
    prompt,
    num_inference_steps=50,
    guidance_scale=7.5,
    height=640,
    width=512,
).images[0]

In [None]:
import torch.optim as optim
import torch.nn as nn
# One-element trainable parameter (initialised to zero) and an AdamW optimizer with
# two parameter groups: the first holds `x`, the second is deliberately empty.
x = nn.Parameter(torch.zeros(1))
optimizer = optim.AdamW(
    [
        dict(params=x, lr=0.1),
        dict(params=[], lr=0.1),
    ]
)

In [None]:
# Backpropagate through b = 2 * x and take one optimizer step.
# Relies on `x` and `optimizer` from the previous cell.
b = x*2
b.backward()
optimizer.step()

In [None]:
from diffusers import DDPMScheduler
# Load the scheduler configuration stored in the checkpoint's "scheduler" subfolder.
# `from_pretrained` is the entry point for a hub id or local path; `from_config`
# is meant for an already-loaded config object, and passing it a path is deprecated.
scheduler = DDPMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")

In [None]:
# First entry of the scheduler's cumulative alpha product (index 0 of alphas_cumprod).
print(scheduler.alphas_cumprod[0])

In [12]:
import torch
# Boolean mask over 0..99 that is False only at positions 1 and 2.
result = torch.logical_and(torch.arange(100) != 1, torch.arange(100) != 2)

In [13]:
result

tensor([ True, False, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True])