In [3]:
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image, export_to_video_with_frames

prompt = "A cat is riding a rollerblade"
image = load_image(image="/root/daneul/projects/refactored/CogVideo/Baselines/val_samples_im/cat.png")
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",
    torch_dtype=torch.bfloat16
)

pipe.enable_sequential_cpu_offload()
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()

video = pipe(
    prompt=prompt,
    image=image,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=5,
    guidance_scale=6,
    generator=torch.Generator(device="cuda").manual_seed(42),
).frames[0]

export_to_video(video, "output.mp4", fps=8)
export_to_video_with_frames(video, 'output.mp4', 'output_frames', fps=8, eval_mode=True)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.34it/s]
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 49.96it/s]it/s]
Loading pipeline components...: 100%|██████████| 5/5 [00:01<00:00,  3.83it/s]
100%|██████████| 50/50 [01:21<00:00,  1.62s/it]


'output.mp4'

In [30]:
import json
import os

from tqdm import tqdm
from PIL import Image
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image, export_to_video_with_frames

In [31]:
prompt_path = "/root/daneul/projects/refactored/CogVideo/Pexels_subset_100K_fps8_flow-25-50_sample500/small/metadata.jsonl"
type_to_eval = "small" # medium, large
first_frame_path = "/root/daneul/projects/refactored/CogVideo/Pexels_subset_100K_fps8_flow-25-50_sample500/small/first_frame"
video_save_path = "/root/daneul/projects/refactored/CogVideo/Baselines/I2V_baseline/Temporal_eval"
sampling_count = 100

In [32]:
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",
    torch_dtype=torch.bfloat16
)

pipe.enable_sequential_cpu_offload()
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s]
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 48.88it/s]it/s]
Loading pipeline components...: 100%|██████████| 5/5 [00:01<00:00,  2.96it/s]


In [None]:
# load path jsonl
with open(prompt_path, "r") as f:
    lines = f.readlines()
    meta_list = [line for line in lines]
# make meta dict 
meta_dict = {}
meta_list = []
with open(prompt_path, 'r') as f:
    for line in f:
        try:
            meta_list.append(json.loads(line))
        except:
            print('Error in loading json')
meta_dict = {}
for meta in meta_list:
    vid_id = str(meta['video_latent_path'].split('/')[-1].split('.')[0])
    meta_dict[vid_id] = meta
# get input image list
image_list = os.listdir(first_frame_path)

for i in tqdm(range(sampling_count)):
    vid_id = str(image_list[i].split(".")[0])
    input_image = os.path.join(first_frame_path, image_list[i])
    prompt = meta_dict[vid_id]["prompt"]
    input_image = Image.open(input_image)
    # pass through I2V
    video = pipe(
        prompt=prompt,
        image=input_image,
        num_videos_per_prompt=1,
        num_inference_steps=50,
        num_frames=5,
        guidance_scale=6,
        generator=torch.Generator(device="cuda").manual_seed(42),
    ).frames[0]
    save_path = os.path.join(video_save_path, f"{i}.mp4")
    frames_save_path = os.path.join(video_save_path, type_to_eval, "video_frames")
    export_to_video_with_frames(video, save_path, frames_save_path, fps=8, eval_mode=True)
    

  4%|▍         | 2/50 [00:18<07:27,  9.32s/it]
  0%|          | 0/100 [00:21<?, ?it/s]


KeyboardInterrupt: 

In [None]:
with open(prompt_path, "r") as f:
    lines = f.readlines()
    meta_list = [line for line in lines]
# make meta dict 
meta_dict = {}
for i in meta_list:
    vid_id = str(i.split("/")[-1].split(".")[0])
    meta_dict[vid_id] = i


In [27]:
meta_dict

{'17778020': '{"video_latent_path": "video_latent/17778020.npy", "first_frame_latent_path": "first_frame_latent/17778020.npy", "prompt": "A musician in a red sweater is captured mid-movement playing a violin against a plain backdrop."}\n',
 '7999352': '{"video_latent_path": "video_latent/7999352.npy", "first_frame_latent_path": "first_frame_latent/7999352.npy", "prompt": "Lush green hillsides blanketed in dense forests, a winding road cutting through the grassy meadow, and a lone cabin resting in the peaceful valley."}\n',
 '5561378': '{"video_latent_path": "video_latent/5561378.npy", "first_frame_latent_path": "first_frame_latent/5561378.npy", "prompt": "A person with shoulder-length hair is wearing a satin outfit, standing in a room with a blurred background suggesting motion or activity."}\n',
 '7131718': '{"video_latent_path": "video_latent/7131718.npy", "first_frame_latent_path": "first_frame_latent/7131718.npy", "prompt": "man wearing a vest"}\n',
 '6250029': '{"video_latent_path