# Text To Video Diffusion

In this notebook, we take a look at Text to Video Diffusion.

### Install and Import required packages

In [None]:
# Install the packages this notebook imports: stablefused (model wrapper and utilities)
# and ipython (display helpers). %pip targets the environment of the running kernel.
%pip install stablefused ipython

In [None]:
import numpy as np
import torch

from IPython.display import display, Video
from diffusers.schedulers import DPMSolverMultistepScheduler

from stablefused import TextToVideoDiffusion
from stablefused.utils import pil_to_video, image_grid

### Initialize model and parameters

We use Cerspense's Zeroscope v2 to initialize our Text To Video Diffusion model. Play around with different prompts and see what you get! You can comment out the seeding lines if you want to generate a new random video each time you run the notebook.

We enable slicing and tiling of the VAE to reduce the memory required for decoding from latent space to image space.

In [None]:
# Model id used to initialize the pipeline. The commented-out line is an
# alternative model id that can be swapped in.
# model_id = "damo-vilab/text-to-video-ms-1.7b"
model_id = "cerspense/zeroscope_v2_576w"

# Load the model with float16 weights. The commented-out call additionally
# passes variant="fp16" -- presumably intended for the alternative model id
# above; TODO confirm before switching.
# model = TextToVideoDiffusion(model_id = model_id, torch_dtype = torch.float16, variant = "fp16")
model = TextToVideoDiffusion(model_id=model_id, torch_dtype=torch.float16)

# Replace the model's scheduler with a DPMSolverMultistepScheduler built from
# the existing scheduler's config.
model.scheduler = DPMSolverMultistepScheduler.from_config(model.scheduler.config)
# VAE slicing and tiling reduce the memory needed when decoding latents to images.
model.enable_slicing()
model.enable_tiling()

In [None]:
# Text prompt for the first video, plus a negative prompt listing qualities
# the result should avoid.
prompt = "An astronaut floating in space, interstellar, black background with stars, photorealistic, high quality, 8k"
negative_prompt = "multiple people, cartoon, unrealistic, blur, boring background, deformed, disfigured, low resolution, unattractive, nsfw"
# Generation settings; both are reused by every model call in this notebook.
num_inference_steps = 15
video_frames = 24
# Fixed seed so that re-running the notebook repeats the same result.
seed = 420

# Seed both torch and numpy global RNGs; comment these two lines out to get a
# different video on each run.
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
# Generate the first video from the prompt at 576x320 (width x height).
# The result is indexed as frames[0] when saved in the next cell, so it
# presumably holds one sequence of frames per prompt -- TODO confirm against
# the stablefused documentation.
frames = model(
    prompt=prompt,
    negative_prompt=negative_prompt,
    video_height=320,
    video_width=576,
    video_frames=video_frames,
    num_inference_steps=num_inference_steps,
    guidance_scale=8.0,
)

In [None]:
# Write the first entry of `frames` to an .mp4 file at 8 frames per second.
filename = "interstellar-astronaut.mp4"
pil_to_video(frames[0], filename, fps=8)

In [None]:
# Show the saved video inline; embed=True stores the video data in the notebook output itself.
display(Video(filename, embed=True))

In [None]:
# Second example: a new prompt and seed. `num_inference_steps` and
# `video_frames` keep the values set for the first example.
prompt = "A mighty pirate ship sailing through the sea, unpleasant, thundering roar, dark night, starry night, high quality, photorealistic, 8k"
seed = 42

# Re-seed both torch and numpy global RNGs for this example.
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
# Pass the same prompt twice so that a single call yields two videos, which
# are tiled together in the following cell. Unlike the first example, no
# negative prompt is supplied and guidance_scale is raised to 12.0.
frames = model(
    prompt=[prompt] * 2,
    video_height=320,
    video_width=576,
    video_frames=video_frames,
    num_inference_steps=num_inference_steps,
    guidance_scale=12.0,
)

In [None]:
# Stack the two videos vertically: zip(*frames) pairs up the frames that share
# a time step, and each pair is merged into a single 2-row, 1-column grid image.
frames_concatenated = [
    image_grid(images, rows=2, cols=1) for images in zip(*frames)
]

In [None]:
# Write the tiled frames to an .mp4 file at 8 frames per second.
filename = "mighty-ship.mp4"
pil_to_video(frames_concatenated, filename, fps=8)

In [None]:
# Show the tiled video inline; embed=True stores the video data in the notebook output itself.
display(Video(filename, embed=True))