### Check device

In [None]:
# Run nvidia-smi in a subshell to report the GPU(s) visible to this runtime.
!nvidia-smi

### Install packages

In [None]:
!pip install -q -U diffusers transformers accelerate

### Helper to clear the GPU memory cache

In [3]:
import gc
import torch

def clear_cache():
  """Free unreferenced Python objects and release cached CUDA memory.

  The garbage collector runs first so that tensors which are no longer
  referenced are actually dropped before the CUDA caching allocator is asked
  to release its unused blocks. The peak-memory statistics are reset as well.

  The CUDA calls are skipped when no CUDA device is available, so the helper
  can be called safely on a CPU-only runtime.

  Returns:
    None.
  """
  gc.collect()
  # reset_peak_memory_stats() raises when CUDA cannot be initialised, so only
  # touch the CUDA allocator when a device is actually present.
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

### Prompts & Artist

In [4]:
# Artist whose name is interpolated into the positive prompt below.
artist = "Paul Cezanne"
# Comma-separated terms handed to the text-to-image pipelines as the negative prompt.
negative_prompt = "worst quality, normal quality, low quality, low res, blurry, distortion, text, watermark, logo, banner, extra digits, cropped, jpeg artifacts, signature, username, error, sketch, duplicate, ugly, monochrome, horror, geometry, mutation, disgusting, bad anatomy, bad proportions, bad quality, deformed, disconnected limbs, out of frame, out of focus, dehydrated, disfigured, extra arms, extra limbs, extra hands, fused fingers, gross proportions, long neck, jpeg, malformed limbs, mutated, mutated hands, mutated limbs, missing arms, missing fingers, picture frame, poorly drawn hands, poorly drawn face, collage, pixel, pixelated, grainy, color aberration, amputee, autograph, bad illustration, beyond the borders, blank background, body out of frame, boring background, branding, cut off, dismembered, disproportioned, distorted, draft, duplicated features, extra fingers, extra legs, fault, flaw, grains, hazy, identifying mark, improper scale, incorrect physiology, incorrect ratio, indistinct, kitsch, low resolution, macabre, malformed, mark, misshapen, missing hands, missing legs, mistake, morbid, mutilated, off-screen, outside the picture, poorly drawn feet, printed words, render, repellent, replicate, reproduce, revolting dimensions, script, shortened, sign, split image, squint, storyboard, tiling, trimmed, unfocused, unattractive, unnatural pose, unreal engine, unsightly, written language, nsfw"

# Positive prompt shared by the text-to-image cells; the artist name is filled in here.
prompt = f"fish flying in the sky in {artist} style, centered composition, photorealistic, masterpiece, 4k"

### Import pipelines

In [None]:
from time import time
from diffusers import AutoPipelineForText2Image, KandinskyPipeline, MarigoldDepthPipeline, MarigoldNormalsPipeline, StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

### Stable Diffusion inference

In [None]:
# Load Stable Diffusion v1.5 with its fp16 weights and move it to the GPU.
pipeline = AutoPipelineForText2Image.from_pretrained(
  "stable-diffusion-v1-5/stable-diffusion-v1-5",
  torch_dtype=torch.float16,
  variant="fp16",
).to("cuda")

start = time()
image_stable_diffusion = pipeline(
  prompt,
  # Must be passed by keyword: the second positional parameter of the Stable
  # Diffusion pipeline call is `height`, not `negative_prompt`.
  negative_prompt=negative_prompt,
  num_inference_steps=50,
  guidance_scale=8.5,
  # The pipeline has no `seed` argument (it would be swallowed silently by
  # **kwargs); reproducibility comes from a seeded torch.Generator instead.
  generator=torch.Generator("cuda").manual_seed(2025),
).images[0]
print(f"Time of inference is {time() - start}s")
# Last expression of the cell: display the generated image.
image_stable_diffusion

### Marigold depth and normals prediction

In [None]:
# NOTE(review): `pipeline` (Stable Diffusion) is still referenced at this point,
# so its GPU memory is presumably not released by this call — confirm.
clear_cache()

# Load the Marigold LCM depth estimator in fp16, then place it on the GPU.
pipeline_marigold = MarigoldDepthPipeline.from_pretrained(
    "prs-eth/marigold-depth-lcm-v1-0",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipeline_marigold = pipeline_marigold.to("cuda")

# Predict depth for the image produced by the Stable Diffusion cell.
depth = pipeline_marigold(image_stable_diffusion)

# Turn the prediction into a viewable image, save it, and display it.
depth_spectral = pipeline_marigold.image_processor.visualize_depth(
    depth.prediction
)
depth_spectral[0].save("image_depth_spectral.png")
depth_spectral[0]

In [None]:
# Export the depth prediction from the previous cell as a 16-bit PNG,
# write it to disk, and display it as the cell output.
depth_16bit = pipeline_marigold.image_processor.export_depth_to_16bit_png(
    depth.prediction
)
depth_16bit[0].save("image_depth_16bit.png")
depth_16bit[0]

In [None]:
clear_cache()

# Load the Marigold LCM surface-normals estimator in fp16, then place it on the GPU.
pipeline_marigold_n = MarigoldNormalsPipeline.from_pretrained(
    "prs-eth/marigold-normals-lcm-v0-1",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipeline_marigold_n = pipeline_marigold_n.to("cuda")

# Predict normals for the image produced by the Stable Diffusion cell.
normals = pipeline_marigold_n(image_stable_diffusion)

# Turn the prediction into a viewable image, save it, and display it.
depth_normals = pipeline_marigold_n.image_processor.visualize_normals(
    normals.prediction
)
depth_normals[0].save("image_normals.png")
depth_normals[0]

### Kandinsky inference

In [None]:
clear_cache()

# Load Kandinsky 2.1 in fp16 and move it to the GPU. Rebinding `pipeline`
# drops the reference to the previously loaded text-to-image pipeline.
pipeline = AutoPipelineForText2Image.from_pretrained(
  "kandinsky-community/kandinsky-2-1",
  torch_dtype=torch.float16
).to("cuda")

# Seeded generator so the sampled image is reproducible across runs.
generator = torch.Generator("cuda").manual_seed(2025)

start = time()
image_kandinsky = pipeline(
  prompt,
  negative_prompt=negative_prompt,
  guidance_scale=8.5,
  # The generator has to be handed to the call; creating it alone has no effect.
  generator=generator,
).images[0]
print(f"Time of inference is {time() - start}s")
# Last expression of the cell: display the generated image.
image_kandinsky

### Stable Video Diffusion inference

In [None]:
clear_cache()

# Load Stable Video Diffusion (img2vid-xt) with fp16 weights. The pipeline is
# deliberately NOT moved to CUDA here: enable_model_cpu_offload() below manages
# device placement itself, and moving everything to the GPU first would defeat
# most of the memory saving.
pipeline = StableVideoDiffusionPipeline.from_pretrained(
  "stabilityai/stable-video-diffusion-img2vid-xt",
  torch_dtype=torch.float16,
  variant="fp16"
)

# Memory optimisations: offload idle sub-models to the CPU and run the UNet
# feed-forward layers in chunks.
pipeline.enable_model_cpu_offload()
pipeline.unet.enable_forward_chunking()

# Image to animate
image = image_stable_diffusion

frames = pipeline(
    image,
    decode_chunk_size=2,  # decode only a couple of frames at a time
    generator=torch.manual_seed(2025),
    num_frames=25,
    motion_bucket_id=180,
    noise_aug_strength=0.1
  ).frames[0]

# Write the generated frames to an MP4 file at 7 frames per second.
export_to_video(frames, "output_opt.mp4", fps=7)

### Marigold example for video

In [None]:
clear_cache()

import imageio
from PIL import Image
from tqdm import tqdm

from diffusers import AutoencoderTiny, utils

device = "cuda"
path_in = "output_opt.mp4"
path_out = "output_opt_depth.gif"

# Replace the VAE of the already-loaded Marigold depth pipeline with the
# "madebyollin/taesd" tiny autoencoder (fp16, on the GPU).
pipeline_marigold.vae = AutoencoderTiny.from_pretrained(
    "madebyollin/taesd", torch_dtype=torch.float16
).to(device)

# Per-frame progress bars are disabled; tqdm below tracks the whole video.
pipeline_marigold.set_progress_bar_config(disable=True)

with imageio.get_reader(path_in) as reader:
    # assumes size is (width, height) — TODO confirm against the imageio reader
    size = reader.get_meta_data()['size']
    last_frame_latent = None

    # One shared starting latent for every frame; its spatial shape is derived
    # from the video size scaled so the longer side maps to 768 // 8.
    latent_shape = (
        1,
        4,
        768 * size[1] // (8 * max(size)),
        768 * size[0] // (8 * max(size)),
    )
    latent_common = torch.randn(latent_shape).to(device=device, dtype=torch.float16)

    out = []
    for frame_id, frame in tqdm(enumerate(reader), desc="Processing Video"):
        frame = Image.fromarray(frame)

        # From the second frame on, blend in the previous frame's latent so
        # consecutive predictions start from similar noise.
        if last_frame_latent is None:
            latents = latent_common
        else:
            latents = 0.9 * latent_common + 0.1 * last_frame_latent

        marigold_depth = pipeline_marigold(
            frame,
            match_input_resolution=False,
            latents=latents,
            output_latent=True,
        )
        last_frame_latent = marigold_depth.latent

        depth_frames = pipeline_marigold.image_processor.visualize_depth(
            marigold_depth.prediction
        )
        out.append(depth_frames[0])

    # Write the visualised depth frames as a GIF at the source video's frame rate.
    utils.export_to_gif(out, path_out, fps=reader.get_meta_data()['fps'])