In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import imageio

from diffusers import UNetSpatioTemporalConditionModel, StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video, export_to_gif

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the Stable Video Diffusion (img2vid) checkpoint on local disk.
path = "/data/lyq/data/stable-video-diffusion-img2vid"

# Both loaders below receive the same dtype and low_cpu_mem_usage settings.
shared_load_kwargs = dict(torch_dtype=torch.float16, low_cpu_mem_usage=False)

# Load the UNet on its own from the checkpoint's "unet" subfolder ...
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    path, subfolder="unet", **shared_load_kwargs
)

# ... and pass that instance to the pipeline loader as its `unet` component.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    path,
    unet=unet,
    variant="fp16",
    local_files_only=True,
    **shared_load_kwargs,
)

# Left as the cell's final expression, so the notebook displays its return value.
pipe.to("cuda:0")

Loading pipeline components...: 100%|██████████| 5/5 [00:18<00:00,  3.77s/it]


StableVideoDiffusionPipeline {
  "_class_name": "StableVideoDiffusionPipeline",
  "_diffusers_version": "0.24.0",
  "_name_or_path": "/data/lyq/data/stable-video-diffusion-img2vid",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    "transformers",
    "CLIPVisionModelWithProjection"
  ],
  "scheduler": [
    "diffusers",
    "EulerDiscreteScheduler"
  ],
  "unet": [
    "diffusers",
    "UNetSpatioTemporalConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKLTemporalDecoder"
  ]
}

In [3]:
# Load the conditioning image and resize it to the same 1024x576 size that is
# requested from the pipeline below.
image = load_image('demo.jpg').resize((1024, 576))

# Fixed seed; the generator returned by torch.manual_seed is handed to the pipeline.
generator = torch.manual_seed(-1)

with torch.inference_mode():
    frames = pipe(
        image,
        height=576,
        width=1024,
        num_frames=24,
        num_inference_steps=30,
        fps=8,
        motion_bucket_id=127,
        decode_chunk_size=8,
        generator=generator,
    ).frames[0]  # keep only the first entry of the returned `frames`

# NOTE(review): the pipeline call uses fps=8 while the file is written at fps=7 —
# confirm the mismatch is intentional.
# Alternative writer that is currently disabled:
#   export_to_video(frames, "generated.mp4", fps=7)
imageio.mimsave("generated.mp4", frames, fps=7)

100%|██████████| 30/30 [01:47<00:00,  3.59s/it]


In [7]:
def rand_log_normal(shape, loc=0., scale=1., device='cpu', dtype=torch.float32):
    """Sample a tensor from a log-normal distribution via inverse-CDF sampling.

    Args:
        shape: Shape of the tensor to draw.
        loc: Mean of the underlying normal distribution.
        scale: Standard deviation of the underlying normal distribution.
        device: Device on which the uniform samples are created.
        dtype: Dtype of the uniform samples.

    Returns:
        A tensor of the requested shape holding ``exp(z)``, where ``z`` is the
        normal inverse CDF evaluated at rescaled uniform samples.
    """
    uniform = torch.rand(shape, dtype=dtype, device=device)
    # Shrink and shift the draws so they are pulled off the endpoints 0 and 1,
    # where the normal inverse CDF diverges.
    uniform = uniform * (1 - 2e-7) + 1e-7
    gaussian = torch.distributions.Normal(loc, scale)
    return torch.exp(gaussian.icdf(uniform))

# Draw a single log-normal sample (underlying normal: mean -3.0, std 0.5).
cond_sigmas = rand_log_normal([1], loc=-3.0, scale=0.5)
# Currently disabled: expanding the sample to five dimensions with
#   cond_sigmas = cond_sigmas[:, None, None, None, None]

# Final expression of the cell, so the notebook displays the tensor's shape.
cond_sigmas.shape

torch.Size([1])