In [None]:
import model_loader
import pipeline
from PIL import Image
from transformers import CLIPTokenizer
import torch

# Compute device used for inference. Starts as "cpu" and is upgraded below
# when an accelerator is both permitted by the flags and actually usable.
DEVICE = "cpu"

ALLOW_CUDA = True
ALLOW_MPS = False  # Metal API for macOS

# torch.has_mps is deprecated and only says the binary was *built* with MPS,
# so it can be True on a machine where MPS cannot run. Rely solely on
# torch.backends.mps.is_available(); the getattr guard keeps this working on
# torch builds that do not ship the mps backend module.
_mps_backend = getattr(torch.backends, "mps", None)

# Check the cheap opt-in flags first so disabled backends are never probed.
if ALLOW_CUDA and torch.cuda.is_available():
    DEVICE = "cuda"
elif ALLOW_MPS and _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
print(f"Using device {DEVICE}")

# Tokenizer built from the local vocabulary and merges files.
tokenizer = CLIPTokenizer(
    "../data/tokenizer_vocab.json",
    merges_file="../data/tokenizer_merges.txt",
)

# Checkpoint to load (file name suggests Stable Diffusion v1.5, EMA-only
# weights -- confirm against the download source). The models are loaded
# once here and handed to pipeline.generate later.
model_file = "../data/v1-5-pruned-emaonly.ckpt"
models = model_loader.preload_models_from_standard_weights(
    model_file,
    DEVICE,
)

## SAMPLING

seed = 42
sampler = "ddpm"
num_inference_steps = 50

## TEXT TO IMAGE

# Other prompts to try:
#   "A young woman by the window in a coffee shop, anime style"
#   "A cat stretching on the floor, highly detailed, ultra sharp, cinematic, 8k resolution"
#   "Historical monochrome photo of a world war two battlefield, highly detailed"
prompt = (
    "A green field under a blue sky, "
    "highly detailed, ultra sharp, cinematic, 8k resolution"
)
# Also known as the negative prompt.
uncond_prompt = ""
# NOTE(review): presumably toggles classifier-free guidance -- confirm in
# pipeline.generate.
do_cfg = True
# Range [1, 14]: how much attention to pay to the prompt (14 is the maximum).
cfg_scale = 7

## IMAGE TO IMAGE

# Leave input_image as None for pure text-to-image. To start from an existing
# picture instead, uncomment the two lines below.
input_image = None
# image_path = "../images/WindowsXP512.jpg"
# input_image = Image.open(image_path)
# NOTE(review): presumably only matters when input_image is set -- confirm
# how pipeline.generate uses it.
strength = 0.6

# Gather every argument for the generation call in one place so the full
# configuration of a run can be inspected before it starts.
generation_kwargs = {
    # Prompting / guidance
    "prompt": prompt,
    "uncond_prompt": uncond_prompt,
    "do_cfg": do_cfg,
    "cfg_scale": cfg_scale,
    # Optional image-to-image input
    "input_image": input_image,
    "strength": strength,
    # Sampling
    "sampler_name": sampler,
    "n_inference_steps": num_inference_steps,
    "seed": seed,
    # Models and placement
    "models": models,
    "tokenizer": tokenizer,
    "device": DEVICE,
    # NOTE(review): presumably where models are parked while not in use --
    # confirm in pipeline.generate.
    "idle_device": "cpu",
}

output_image = pipeline.generate(**generation_kwargs)

# Kept as the final expression so a notebook renders the resulting image.
Image.fromarray(output_image)