In [None]:
# --- Environment setup (IPython/Colab cell: `%` and `!` lines are magics, not Python) ---
# Clone the `dev` branch of the DiffSynth-Studio fork into /content and work from
# inside the checkout.
%cd /content
!git clone -b dev https://github.com/camenduru/DiffSynth-Studio
%cd /content/DiffSynth-Studio

# Python dependencies installed on top of the runtime's existing packages;
# only controlnet-aux is pinned to an exact version.
!pip install -q einops transformers controlnet-aux==0.0.7 sentencepiece imageio imageio-ffmpeg

# aria2 is used as the downloader for the model weights below.
# Flags shared by every aria2c call: -c resumes a partial download, -x 16 / -s 16
# allow up to 16 connections/splits, -k 1M sets the minimum split size,
# -d is the target directory and -o the output file name.
!sudo apt -y install -qq aria2
# HunyuanDiT text-to-image weights (CLIP text encoder, mT5, EMA model, SDXL VAE),
# saved under models/HunyuanDiT/t2i/ -- the paths loaded by generate_image().
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/clip_text_encoder/pytorch_model.bin -d /content/DiffSynth-Studio/models/HunyuanDiT/t2i/clip_text_encoder -o pytorch_model.bin
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/mt5/pytorch_model.bin -d /content/DiffSynth-Studio/models/HunyuanDiT/t2i/mt5 -o pytorch_model.bin
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/model/pytorch_model_ema.pt -d /content/DiffSynth-Studio/models/HunyuanDiT/t2i/model -o pytorch_model_ema.pt
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/resolve/main/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin -d /content/DiffSynth-Studio/models/HunyuanDiT/t2i/sdxl-vae-fp16-fix -o diffusion_pytorch_model.bin
# Stable Video Diffusion (svd_xt) and ExVideo-SVD-128f weights, saved under
# models/stable_video_diffusion/ -- the paths loaded by generate_video() and
# upscale_video().
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors -d /content/DiffSynth-Studio/models/stable_video_diffusion -o svd_xt.safetensors
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1/resolve/main/model.fp16.safetensors -d /content/DiffSynth-Studio/models/stable_video_diffusion -o model.fp16.safetensors

In [None]:
%cd /content/DiffSynth-Studio

from diffsynth import save_video, ModelManager, SVDVideoPipeline, HunyuanDiTImagePipeline
from diffsynth import ModelManager
import torch, os, imageio
import numpy as np
from PIL import Image

def read_video(file_name):
    """Decode every frame of a video file into memory.

    Args:
        file_name: Location of the video; passed unchanged to
            ``imageio.get_reader``.

    Returns:
        list[numpy.ndarray]: One independent array per frame, in the order
        the reader yields them.

    The reader is always closed, even if decoding fails part-way through,
    so the underlying decoder resources are not leaked on error.
    """
    reader = imageio.get_reader(file_name)
    try:
        # np.array() copies by default, so each frame is already detached
        # from any buffer the reader may reuse; no extra .copy() is needed.
        return [np.array(frame) for frame in reader]
    finally:
        reader.close()

def generate_image():
    """Produce the first frame with the HunyuanDiT text-to-image pipeline.

    Loads the four HunyuanDiT checkpoints in float16 on CUDA, seeds torch
    with 0 and runs the pipeline once at 1024x1024 with a fixed prompt and
    negative prompt.

    Returns:
        The object returned by the ``HunyuanDiTImagePipeline`` call. The
        script below treats it as a PIL-style image (it calls ``.save``).
    """
    # Must be set before the tokenizers are instantiated by the model loader.
    os.environ["TOKENIZERS_PARALLELISM"] = "True"

    hunyuan_checkpoints = [
        "models/HunyuanDiT/t2i/clip_text_encoder/pytorch_model.bin",
        "models/HunyuanDiT/t2i/mt5/pytorch_model.bin",
        "models/HunyuanDiT/t2i/model/pytorch_model_ema.pt",
        "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin",
    ]
    manager = ModelManager(torch_dtype=torch.float16, device="cuda")
    manager.load_models(hunyuan_checkpoints)
    text_to_image = HunyuanDiTImagePipeline.from_model_manager(manager)

    # Fixed seed so repeated runs start from the same torch RNG state.
    torch.manual_seed(0)
    sampling_args = dict(
        prompt="bonfire, on the stone",
        negative_prompt="错误的眼睛，糟糕的人脸，毁容，糟糕的艺术，变形，多余的肢体，模糊的颜色，模糊，重复，病态，残缺，",
        num_inference_steps=50,
        height=1024,
        width=1024,
    )
    return text_to_image(**sampling_args)


def generate_video(image):
    """Animate ``image`` into a 128-frame video at 512x512.

    Loads the SVD-XT and ExVideo checkpoints in float16 on CUDA, seeds torch
    with 1 and runs the ``SVDVideoPipeline`` for 50 inference steps.

    Args:
        image: First frame; must provide ``.resize((w, h))`` (e.g. a PIL
            image). It is resized to 512x512 before being fed to the pipeline.

    Returns:
        The object returned by the ``SVDVideoPipeline`` call (the script
        passes it to ``save_video``).
    """
    svd_checkpoints = [
        "models/stable_video_diffusion/svd_xt.safetensors",
        "models/stable_video_diffusion/model.fp16.safetensors",
    ]
    manager = ModelManager(torch_dtype=torch.float16, device="cuda")
    manager.load_models(svd_checkpoints)
    image_to_video = SVDVideoPipeline.from_model_manager(manager)

    # Fixed seed so repeated runs start from the same torch RNG state.
    torch.manual_seed(1)
    side = 512
    first_frame = image.resize((side, side))
    return image_to_video(
        input_image=first_frame,
        num_frames=128,
        fps=30,
        height=side,
        width=side,
        motion_bucket_id=127,
        num_inference_steps=50,
        min_cfg_scale=2,
        max_cfg_scale=2,
        contrast_enhance_scale=1.2,
    )


def upscale_video(image, video):
    """Re-render ``video`` at 1024x1024 using the SVD pipeline.

    Loads the same SVD-XT and ExVideo checkpoints as ``generate_video``,
    resizes the reference image and every input frame to 1024x1024, seeds
    torch with 2 and runs the pipeline for 25 inference steps with the
    resized frames supplied as ``input_video``.

    Args:
        image: Reference first frame; must provide ``.resize((w, h))``.
        video: Iterable of frames accepted by ``PIL.Image.fromarray``
            (e.g. the NumPy arrays returned by ``read_video``).

    Returns:
        The object returned by the ``SVDVideoPipeline`` call.
    """
    svd_checkpoints = [
        "models/stable_video_diffusion/svd_xt.safetensors",
        "models/stable_video_diffusion/model.fp16.safetensors",
    ]
    manager = ModelManager(torch_dtype=torch.float16, device="cuda")
    manager.load_models(svd_checkpoints)
    video_pipeline = SVDVideoPipeline.from_model_manager(manager)

    target_size = (1024, 1024)
    # Convert each raw frame to a PIL image and enlarge it in a single pass.
    enlarged_frames = [
        Image.fromarray(raw_frame).resize(target_size) for raw_frame in video
    ]

    # Fixed seed so repeated runs start from the same torch RNG state.
    torch.manual_seed(2)
    reference_frame = image.resize(target_size)
    return video_pipeline(
        input_image=reference_frame,
        input_video=enlarged_frames,
        num_frames=128,
        fps=30,
        height=1024,
        width=1024,
        motion_bucket_id=127,
        num_inference_steps=25,
        min_cfg_scale=2,
        max_cfg_scale=2,
        contrast_enhance_scale=1.2,
    )


# Step 1: use Hunyuan DiT to generate the first frame and save it as image.png.
# To start from your own picture instead, replace the two lines below with
# `image = Image.open("your_image_file.png")`.
image = generate_image()
image.save("image.png")
# Reload the saved frame from disk. The absolute path only matches the relative
# save above while the working directory is /content/DiffSynth-Studio (set by
# the %cd at the top of this cell).
image = Image.open("/content/DiffSynth-Studio/image.png")

# Step 2: generate a video with a resolution of 512 and write it to disk.
video = generate_video(image)
save_video(video, "video_512.mp4", fps=30)

# Step 3: upscale the video to 1024.
# The first frame and the 512 video are re-read from disk rather than passing
# the in-memory `video` straight to upscale_video(); read_video() yields NumPy
# arrays, which is what upscale_video()'s Image.fromarray() call expects.
# NOTE(review): this also means the upscaler sees the encoded video_512.mp4
# frames, not the pipeline's original output -- confirm that is intended.
image = Image.open("/content/DiffSynth-Studio/image.png")
video = read_video("/content/DiffSynth-Studio/video_512.mp4")
video = upscale_video(image, video)
save_video(video, "video_1024.mp4", fps=30)