In [1]:
import os
# Restrict the CUDA devices visible to this process to device index 0.
# NOTE(review): this is assigned before `import torch` below — presumably so the
# setting is already in place when CUDA is first initialised; keep this ordering.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import imageio

from diffusers import UNetSpatioTemporalConditionModel, StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video, export_to_gif

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Local checkpoint directory for Stable Video Diffusion (img2vid).
path = "/data/lyq/data/stable-video-diffusion-img2vid"

# Options shared by both loaders: half-precision weights, regular (non-lazy) loading.
shared_load_kwargs = dict(torch_dtype=torch.float16, low_cpu_mem_usage=False)

# Load the UNet on its own from the "unet" subfolder so it can be inspected
# separately and then handed to the pipeline.
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    path, subfolder="unet", **shared_load_kwargs
)

# Build the full pipeline around that UNet, using only local fp16 files.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    path,
    unet=unet,
    variant="fp16",
    local_files_only=True,
    **shared_load_kwargs,
)

# Move the pipeline to the first visible GPU; as the last expression of the
# cell, the returned pipeline is also displayed.
pipe.to("cuda:0")

Loading pipeline components...: 100%|██████████| 5/5 [00:16<00:00,  3.32s/it]


StableVideoDiffusionPipeline {
  "_class_name": "StableVideoDiffusionPipeline",
  "_diffusers_version": "0.24.0",
  "_name_or_path": "/data/lyq/data/stable-video-diffusion-img2vid",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    "transformers",
    "CLIPVisionModelWithProjection"
  ],
  "scheduler": [
    "diffusers",
    "EulerDiscreteScheduler"
  ],
  "unet": [
    "diffusers",
    "UNetSpatioTemporalConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKLTemporalDecoder"
  ]
}

In [19]:
# Show only the first down block of the UNet: fetch it directly instead of
# starting a loop and breaking out after one iteration. Nothing is printed if
# `unet.down_blocks` is empty, matching the behaviour of the original loop.
first_down_block = next(iter(unet.down_blocks), None)
if first_down_block is not None:
    print("Down block 0:")
    print(first_down_block)

Down block 0:
CrossAttnDownBlockSpatioTemporal(
  (attentions): ModuleList(
    (0-1): 2 x TransformerSpatioTemporalModel(
      (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
      (proj_in): Linear(in_features=320, out_features=320, bias=True)
      (transformer_blocks): ModuleList(
        (0): BasicTransformerBlock(
          (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (attn1): Attention(
            (to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
            (to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
            (to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)
            (to_out): ModuleList(
              (0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)
              (1): Dropout(p=0.0, inplace=False)
            )
          )
          (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
          (attn2): Attention(
         

In [3]:
# Conditioning image, resized to the same width/height requested from the pipeline.
image = load_image('demo.jpg').resize((1024, 576))

# Seed the RNG; the generator returned by torch is passed to the pipeline.
generator = torch.manual_seed(-1)

# All sampling options in one place so they are easy to scan and tweak.
sampling_kwargs = dict(
    num_frames=24,
    width=1024,
    height=576,
    decode_chunk_size=8,
    generator=generator,
    motion_bucket_id=127,
    fps=8,
    num_inference_steps=30,
)

with torch.inference_mode():
    # `.frames` holds one frame sequence per input image; keep the first.
    frames = pipe(image, **sampling_kwargs).frames[0]

# NOTE(review): the pipeline is called with fps=8 but the file is written at
# fps=7 — confirm that this difference is intended.
# Alternative writer: export_to_video(frames, "generated.mp4", fps=7)
imageio.mimsave("generated.mp4", frames, fps=7)

100%|██████████| 30/30 [01:47<00:00,  3.59s/it]


In [3]:
def rand_log_normal(shape, loc=0., scale=1., device='cpu', dtype=torch.float32):
    """Draw samples from a log-normal distribution.

    Uniform samples are pushed through the inverse CDF of Normal(loc, scale)
    and then exponentiated.

    Args:
        shape: Shape of the returned tensor (forwarded to ``torch.rand``).
        loc: Mean of the underlying normal distribution.
        scale: Standard deviation of the underlying normal distribution.
        device: Device on which the uniform samples are created.
        dtype: Dtype of the uniform samples.

    Returns:
        A tensor of the requested shape holding the log-normal samples.
    """
    uniform = torch.rand(shape, dtype=dtype, device=device)
    # Squeeze the samples away from 0 and 1 so the inverse CDF stays finite.
    uniform = uniform * (1 - 2e-7) + 1e-7
    gaussian = torch.distributions.Normal(loc, scale)
    return torch.exp(gaussian.icdf(uniform))

# One conditioning noise level, sampled with loc=-3.0 and scale=0.5.
cond_sigmas = rand_log_normal(
    shape=[1,],
    loc=-3.0,
    scale=0.5,
)
# Optional: add four trailing singleton axes for broadcasting, i.e.
# cond_sigmas = cond_sigmas[:, None, None, None, None]

# Display the shape of the sampled tensor.
cond_sigmas.shape

torch.Size([1])

In [15]:
# Sample one noise level (sigma) with loc=0.7 and scale=1.6.
sigmas = rand_log_normal(shape=[1,], loc=0.7, scale=1.6)
# Append four singleton axes so sigma has five dimensions.
# NOTE(review): presumably this is for broadcasting against 5-D video latents
# when noise is added; no latents are noised in this cell — confirm against
# the training code this was taken from.
sigmas = sigmas[:, None, None, None, None]
# Timestep conditioning value: 0.25 * log(sigma). Computed with one vectorised
# op on the flattened sigmas instead of building a tensor from a Python list
# of per-element tensors via the legacy `torch.Tensor(...)` constructor.
timesteps = 0.25 * sigmas.flatten().log()
# Repeat the single value three times (expand needs a size-1 leading dim).
timesteps = timesteps.expand(3)
timesteps

tensor([0.8155, 0.8155, 0.8155])