# Experiments with Text-To-Video Zero Pipeline

In [1]:
# Shared imports for every experiment in this notebook.
import torch
import imageio
# Each diffusers name is imported once (TextToVideoZeroPipeline was previously listed twice).
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, TextToVideoZeroPipeline
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
from huggingface_hub import hf_hub_download
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# List the devices JAX reports as local to this process; the bare expression
# on the last line makes the notebook display the returned list.
import jax
jax.local_devices()

[TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0),
 TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0),
 TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0),
 TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)]

### Text-To-Video

In [10]:
# Generate a short clip from a text prompt alone and write it to video.mp4.
model_id = "tuwonga/zukki_style"
prompt = "A person taking a walk through the city at night"

pipe = TextToVideoZeroPipeline.from_pretrained(model_id)
frames = pipe(prompt=prompt).images

# Scale every frame by 255 and cast to uint8 before encoding
# (presumably the pipeline returns float frames in [0, 1] — confirm).
result = [(frame * 255).astype("uint8") for frame in frames]

imageio.mimsave("video.mp4", result, fps=4)

Downloading (…)ain/model_index.json: 100%|██████████| 548/548 [00:00<00:00, 118kB/s]
Downloading (…)rocessor_config.json: 100%|██████████| 342/342 [00:00<00:00, 156kB/s]
Fetching 15 files:   7%|▋         | 1/15 [00:00<00:02,  5.88it/s]
Downloading (…)_encoder/config.json: 100%|██████████| 612/612 [00:00<00:00, 191kB/s]
Downloading (…)cheduler_config.json: 100%|██████████| 313/313 [00:00<00:00, 27.9kB/s]

Downloading (…)_checker/config.json: 100%|██████████| 4.84k/4.84k [00:00<00:00, 990kB/s]

Downloading (…)cial_tokens_map.json: 100%|██████████| 472/472 [00:00<00:00, 43.4kB/s]
Downloading (…)tokenizer/merges.txt: 100%|██████████| 525k/525k [00:00<00:00, 7.12MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 806/806 [00:00<00:00, 364kB/s]
Downloading (…)f96/unet/config.json: 100%|██████████| 748/748 [00:00<00:00, 416kB/s]

[A

Downloading (…)bf96/vae/config.json: 100%|██████████| 581/581 [00:00<00:00, 272kB/s]
Downloading (…)tokenizer/vocab.json: 100%|██████████| 1.06M/1.06M [

### Text-To-Video with Pose Control

In [None]:
# Generate a pose-guided clip: every frame read from the pose video is passed
# as the conditioning image for one generated frame.
model_id = "runwayml/stable-diffusion-v1-5" # base model
video_path = "__assets__/poses_skeleton_gifs/dance1_corr.mp4" # pose video

# Read the first `frame_count` frames. The context manager closes the ffmpeg
# reader as soon as the frames are loaded instead of leaving it open.
frame_count = 8
with imageio.get_reader(video_path, "ffmpeg") as reader:
    pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id, controlnet=controlnet)

# Set the attention processor on both the UNet and the ControlNet.
# NOTE(review): batch_size=2 presumably corresponds to the unconditional +
# conditional passes of classifier-free guidance — confirm.
pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))

# Fix latents for all frames: draw one noise tensor and repeat it per frame.
# A seeded generator makes the draw reproducible across kernel restarts.
generator = torch.Generator().manual_seed(0)
latents = torch.randn((1, 4, 64, 64), generator=generator).repeat(len(pose_images), 1, 1, 1)

prompt = "Darth Vader dancing in a desert"
result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
imageio.mimsave("video.mp4", result, fps=4)

### Text-To-Video with Safetensors

In [2]:
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt
from huggingface_hub import hf_hub_download

# Fetch the .safetensors checkpoint file and report where it landed locally.
ckpt_path = hf_hub_download(
    repo_id="breakcore2/ligne_claire_anime_diffusion",
    filename="ligne_claire_anime_diffusion_v1.safetensors",
)
print(f"Checkpoint path: {ckpt_path}")

# The conversion reads "v1-inference.yaml" via a relative path; it can be fetched with:
# !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
conversion_kwargs = dict(
    checkpoint_path=ckpt_path,
    original_config_file="v1-inference.yaml",
    from_safetensors=True,
)
pipe = download_from_original_stable_diffusion_ckpt(**conversion_kwargs)

# Save the converted pipeline to disk with safe serialization enabled.
pipe.save_pretrained("../models/ligne_claire", safe_serialization=True)

Downloading (…)usion_v1.safetensors: 100%|██████████| 2.13G/2.13G [00:12<00:00, 171MB/s] 


Checkpoint path: /home/awu/.cache/huggingface/hub/models--breakcore2--ligne_claire_anime_diffusion/snapshots/0e89c2e14030f1afdc77b208e35aaf4a597238d9/ligne_claire_anime_diffusion_v1.safetensors
global_step key not found in model


Downloading (…)lve/main/config.json: 100%|██████████| 4.52k/4.52k [00:00<00:00, 921kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.71G/1.71G [00:06<00:00, 276MB/s] 
Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.5.self_attn.k_proj.bias', 'logit_scale', 'vision_model.encoder.layers.7.self_attn.out_proj.bias', 'vision_model.encoder.layers.7.mlp.fc2.weight', 'vision_model.encoder.layers.14.layer_norm1.weight', 'vision_model.encoder.layers.17.layer_norm1.weight', 'vision_model.encoder.layers.2.mlp.fc1.weight', 'vision_model.encoder.layers.16.self_attn.q_proj.bias', 'vision_model.encoder.layers.5.mlp.fc2.weight', 'vision_model.encoder.layers.12.layer_norm1.weight', 'vision_model.encoder.layers.6.layer_norm2.bias', 'vision_model.encoder.layers.13.layer_norm1.bias', 'vision_model.encoder.layers.13.self_attn.v_proj.bias', 'vision_model.encoder.layers.18.self_attn.k_proj.weight', 'vision_

In [8]:
# Inspect the current `pipe` object; as a bare last expression, the notebook displays its repr.
pipe

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.16.0.dev0",
  "feature_extractor": [
    "transformers",
    "CLIPFeatureExtractor"
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [7]:
# Inspect the pipeline's `unet` attribute; the notebook displays its repr (the module tree).
pipe.unet

UNet2DConditionModel(
  (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=320, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (down_blocks): ModuleList(
    (0): CrossAttnDownBlock2D(
      (attentions): ModuleList(
        (0-1): 2 x Transformer2DModel(
          (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
          (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
          (transformer_blocks): ModuleList(
            (0): BasicTransformerBlock(
              (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
              (attn1): Attention(
                (to_q): Linear(in_features=320, out_features=320, bias=False)
                (to_k): Linear(in_features=320, out_features=320, bias=False)
                (to_v): Linear(in_features=320, out_fe