In [None]:
from diffusers.utils import load_image

# Download the sample painting and scale it to 512x512 in one expression; the
# later cells reuse `img` as the input for every annotator.
img = load_image(
    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
).resize((512, 512))

In [None]:
# Models are downloaded to `HF_HOME/hub/models--lllyasviel--Annotators`
from IPython.display import display
from controlnet_aux.processor import Processor, MODELS
import controlnet_aux

# Print the installed controlnet_aux version alongside the results.
print(controlnet_aux.__version__)
keys = set(MODELS.keys())
# "zoe" and "mediapipe_face" are left out -- presumably because they need
# extra optional dependencies; TODO confirm.
# The ids are sorted because iterating a set of strings has no stable order
# between kernel sessions (string hashing is randomized), which would make the
# printed list and the per-processor outputs below appear in a different order
# on every run.
processors = sorted(keys - {"zoe", "mediapipe_face"})
print(processors)

In [None]:
# Run every selected annotator on the input image and show each result inline,
# preceded by the processor id so the outputs can be told apart.
for proc_id in processors:
    print(f"Processor: {proc_id}")
    # A fresh Processor is built per id.
    # NOTE(review): this presumably downloads/loads the annotator weights on
    # first use -- confirm against controlnet_aux.
    processor = Processor(proc_id)
    # to_bytes=False: presumably returns an image object rather than encoded
    # bytes (the result is handed straight to display()) -- TODO confirm.
    proc_img = processor(img, to_bytes=False)
    display(proc_img)

In [None]:
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler
import torch


# Build the ControlNet pipeline: a depth ControlNet attached to a Stable
# Diffusion base model, both loaded with float16 weights.
# Base model ids noted by the author as alternatives:
# CompVis/stable-diffusion-v1-4, runwayml/stable-diffusion-v1-5, stabilityai/stable-diffusion-2-1
# NOTE(review): verify that each alternative is compatible with the
# "sd-controlnet-depth" checkpoint before swapping it in.
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth", torch_dtype=torch.float16)
# NOTE(review): confirm this base-model repo id still resolves on the Hub.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
# Replace the pipeline's default scheduler with UniPC, reusing the existing
# scheduler's configuration.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# Going by its name, this keeps sub-models on the CPU until they are needed to
# reduce GPU memory use -- see the diffusers docs for the exact behavior.
pipe.enable_model_cpu_offload()
# Optional, left disabled by the author:
# pipe.enable_xformers_memory_efficient_attention()

In [None]:
from PIL import Image, ImageDraw, ImageFont


# Font used for tile captions. "sans-serif.ttf" is not guaranteed to exist on
# every machine, so fall back to Pillow's built-in font instead of failing the
# whole cell with OSError.
try:
    font = ImageFont.truetype("sans-serif.ttf", 16)
except OSError:
    font = ImageFont.load_default()


def image_grid(imgs, rows, cols, labels=None):
    """Tile equally sized images into a single ``rows`` x ``cols`` RGB image.

    Images are placed in row-major order: index ``i`` goes to column
    ``i % cols`` and row ``i // cols``. The tile size is taken from the
    first image.

    Args:
        imgs: Sequence of PIL images; its length must equal ``rows * cols``.
        rows: Number of grid rows.
        cols: Number of grid columns.
        labels: Optional sequence with one caption per image. When given,
            each caption is drawn in white near the top-left corner of its
            tile using the module-level ``font``. When omitted, no text is
            drawn.

    Returns:
        A new ``PIL.Image.Image`` of size ``(cols * w, rows * h)``.

    Raises:
        AssertionError: If the number of images (or labels) does not match
            the grid size.
    """
    assert len(imgs) == rows * cols, "len(imgs) must equal rows * cols"
    if labels is not None:
        assert len(labels) == len(imgs), "labels must have one entry per image"

    w, h = imgs[0].size
    grid = Image.new("RGB", size=(cols * w, rows * h))
    # One drawing context is enough; it draws directly onto `grid`.
    draw = ImageDraw.Draw(grid)

    for i, tile in enumerate(imgs):
        x, y = i % cols * w, i // cols * h
        grid.paste(tile, box=(x, y))
        # Caption is drawn after pasting so the tile cannot cover it.
        if labels is not None:
            draw.text((x + 4, y + 4), str(labels[i]), fill=(255, 255, 255), font=font)
    return grid

In [None]:
# One prompt per subject, each with the same quality suffix appended.
prompt = [
    subject + ", best quality, extremely detailed"
    for subject in ("Sandra Oh", "beyonce", "oprah", "michelle obama")
]
# One CPU generator per prompt, all seeded with the same value so every image
# starts from the same random state.
generator = [torch.Generator(device="cpu").manual_seed(2) for _ in prompt]

In [None]:
# Produce the conditioning image for the pipeline by running the "midas"
# annotator on the input image (presumably a depth estimate, matching the depth
# ControlNet loaded above -- TODO confirm).
depth_proc = Processor("midas")
control_img = depth_proc(img, to_bytes=False)
# Bare last expression so the notebook renders the control image inline.
control_img

In [None]:
# Generate one image per prompt, all conditioned on the same control image.
output = pipe(
    prompt,
    control_img,
    # One negative prompt per prompt. Sized from `prompt` rather than a
    # hard-coded 4 so the two batches cannot drift apart when the prompt list
    # is edited.
    negative_prompt=["monochrome, lowres, bad anatomy, worst quality, low quality"] * len(prompt),
    num_inference_steps=20,
    generator=generator,
    width=512,
    height=512,
)
# The 2 x 2 layout must match the number of generated images; image_grid
# asserts this.
image_grid(output.images, 2, 2)