# Experiments with Text-To-Video Zero Pipeline

In [None]:
import torch
import imageio
from diffusers import TextToVideoZeroPipeline, ControlNetModel, StableDiffusionControlNetPipeline, TextToVideoZeroPipeline
from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
from huggingface_hub import hf_hub_download
from PIL import Image

In [None]:
# Sanity-check the runtime: ask JAX which devices are attached to this host.
# Kept as the cell's final bare expression so the notebook displays the result.
# NOTE(review): none of the visible cells use JAX for generation (they run on
# torch) — confirm this check is still needed.
import jax
jax.local_devices()

### Text-To-Video

In [None]:
# Generate a short clip from a text prompt with the Text2Video-Zero pipeline
# running on top of a Stable Diffusion checkpoint, then save it as an MP4.
model_id = "runwayml/stable-diffusion-v1-5"  # TODO: experiment with pretrained custom models on Hugging Face

# float16 weights are meant for GPU inference, so device and dtype are chosen
# together: half precision on CUDA, full precision when falling back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=dtype).to(device)

prompt = "A panda is playing guitar on times square"
result = pipe(prompt=prompt).images
# Frames are scaled by 255 and cast to uint8 before encoding
# (assumes the pipeline returns float frames in [0, 1]).
result = [(r * 255).astype("uint8") for r in result]
imageio.mimsave("video.mp4", result, fps=4)

### Text-To-Video with Pose Control

In [None]:
# Generate a pose-guided clip: an OpenPose ControlNet steers Stable Diffusion
# frame by frame, while cross-frame attention and shared latents keep the
# frames consistent with each other.
import os

model_id = "runwayml/stable-diffusion-v1-5"  # base model
repo_id = "PAIR/Text2Video-Zero"
pose_video = "__assets__/poses_skeleton_gifs/dance1_corr.mp4"  # pose video

# Use a local copy when the notebook runs inside a checkout that ships the
# asset; otherwise fetch it from the Hugging Face Space named by repo_id.
video_path = pose_video
if not os.path.exists(video_path):
    video_path = hf_hub_download(repo_type="space", repo_id=repo_id, filename=pose_video)

# Read the first frame_count frames as PIL images; the context manager
# closes the ffmpeg reader once the frames are extracted.
frame_count = 8
with imageio.get_reader(video_path, "ffmpeg") as reader:
    pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]

# Keep ControlNet, pipeline and latents on the same device with the same
# dtype: half precision on CUDA, full precision when falling back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=dtype)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    model_id, controlnet=controlnet, torch_dtype=dtype
).to(device)

# Set the attention processor
pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))

# fix latents for all frames: one random latent is sampled and repeated, so
# every frame starts from the same noise
latents = torch.randn((1, 4, 64, 64), device=device, dtype=dtype).repeat(len(pose_images), 1, 1, 1)

prompt = "Darth Vader dancing in a desert"
result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
imageio.mimsave("video.mp4", result, fps=4)


### Text-To-Video with Safetensors