In [None]:

# Single-prompt smoke test: generate one clip with CogVideoX-5b and write it
# to output.mp4 in the current working directory.
#
# The imports are repeated here because this cell is placed above the
# notebook's import cell; without them a top-to-bottom run on a fresh kernel
# fails with NameError.
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."

# Load the pretrained weights in bfloat16.
pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.bfloat16
)

# Memory-saving switches provided by diffusers (CPU offload of pipeline
# components and tiled VAE processing).
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()

# A fixed seed keeps the sample reproducible between runs.
video = pipe(
    prompt=prompt,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    generator=torch.Generator(device="cuda").manual_seed(42),
).frames[0]  # frames[0]: the frames of the first (and only) requested video

# Last expression of the cell, so the returned output path is displayed.
export_to_video(video, "output.mp4", fps=8)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.78it/s]it/s]
Loading pipeline components...: 100%|██████████| 5/5 [00:01<00:00,  3.99it/s]
100%|██████████| 50/50 [03:48<00:00,  4.58s/it]
It is recommended to use `export_to_video` with `imageio` and `imageio-ffmpeg` as a backend. 
These libraries are not present in your environment. Attempting to use legacy OpenCV backend to export video. 
Support for the OpenCV backend will be deprecated in a future Diffusers version


'output.mp4'

In [1]:
import os
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video, export_to_video_with_frames
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Evaluation configuration ------------------------------------------------
# Folder of the evaluation subset; the prompt metadata file and the first-frame
# image directory both live directly beneath it.
_eval_subset_root = (
    "/mnt/carpedkm_data/image_gen_ds/"
    "Pexels_subset_100K_fps8_flow-25-50_sample500/medium"
)

# Root directory for everything this notebook writes.
save_dir = "/mnt/carpedkm_data/temporal_eval_result/t2v_vanilla"

# JSONL file with one metadata record per line.
prompt_path = f"{_eval_subset_root}/metadata.jsonl"

# Directory whose file names determine which video ids are processed.
temporal_eval_first_frame = f"{_eval_subset_root}/first_frame"

# Name of the output sub-directory created under save_dir.
# NOTE(review): the inputs above come from the "medium" subset while this label
# is "small" — confirm the mismatch is intentional.
temporal_eval_type = "small"

# Total number of items in the evaluation budget, and the index of the shard
# handled by this notebook instance.
temporal_eval_use_amount = 300
temporal_eval_shard = 0

In [3]:
# Build the CogVideoX-5b text-to-video pipeline with bfloat16 weights.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

# Reduce GPU memory pressure: let diffusers offload pipeline components to the
# CPU, and have the VAE work on tiles instead of whole frames.
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.65it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 32.92it/s]it/s]
Loading pipeline components...: 100%|██████████| 5/5 [00:00<00:00,  5.52it/s]


In [9]:
# Prepare the output tree: <save_dir>/<temporal_eval_type>/
os.makedirs(save_dir, exist_ok=True)
output_dir = os.path.join(save_dir, temporal_eval_type)
os.makedirs(output_dir, exist_ok=True)
resizing = False
meta_path = prompt_path

# Read the JSONL metadata, one record per line. Malformed lines are reported
# (with their line number) and skipped. Only JSON decoding errors are caught,
# so interrupts and unrelated failures are no longer silently swallowed.
meta_list = []
with open(meta_path, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        try:
            meta_list.append(json.loads(line))
        except json.JSONDecodeError as err:
            print(f'Error in loading json (line {line_no}): {err}')

# Index the records by video id, taken as the file name of
# 'video_latent_path' up to its first dot. Later records overwrite earlier
# ones that share an id.
meta_dict = {
    meta['video_latent_path'].split('/')[-1].split('.')[0]: meta
    for meta in meta_list
}

# Sorted so that shard index ranges refer to the same files on every run.
input_image_path = temporal_eval_first_frame  # prepend 'small', 'medium', 'large'
input_image_list = sorted(os.listdir(input_image_path))

In [10]:
# Items per shard: the evaluation budget split evenly across four shards
# (any remainder from the floor division is dropped).
num_shards = 4
shard_amount = temporal_eval_use_amount // num_shards

In [12]:
# Generate one output per first-frame file in this shard, using the prompt
# stored in the metadata for the matching video id.

# Loop-invariant output locations, computed once.
vid_save_dir = os.path.join(output_dir, 'videos')
os.makedirs(vid_save_dir, exist_ok=True)
frames_root_dir = os.path.join(output_dir, 'video_frames')

# Index window of this shard, clamped to the number of available inputs so a
# short input directory ends the loop cleanly instead of raising IndexError.
shard_start = temporal_eval_shard * shard_amount
requested_end = (temporal_eval_shard + 1) * shard_amount
shard_end = min(requested_end, len(input_image_list))
if shard_end < requested_end:
    print(f"Shard {temporal_eval_shard} truncated: only {len(input_image_list)} input images available")

for i in range(shard_start, shard_end):
    print(f"Processing {i}th video")
    # Not consumed by the text-to-video call below; kept because it is a
    # notebook-level name that other cells may read.
    input_image = os.path.join(input_image_path, input_image_list[i])
    vid_id = input_image_list[i].split('.')[0]

    # Resume support: an existing frames directory marks the id as done.
    # NOTE(review): the directory is created before the export runs, so an
    # interruption during export may leave a partial result that is skipped
    # on the next run — confirm whether that matters here.
    frames_save_dir = os.path.join(frames_root_dir, vid_id)
    if os.path.exists(frames_save_dir):
        print('Already exists: ', frames_save_dir)
        continue

    if vid_id not in meta_dict:
        print('No prompt found for vid_id: ', vid_id)
        # Stops the whole shard on the first missing prompt (original behaviour).
        break
    prompt = meta_dict[vid_id]['prompt']

    # NOTE(review): num_frames=1 requests a single frame per prompt, whereas
    # the demo cell at the top of the notebook uses 49 — confirm intended.
    video = pipe(
        prompt=prompt,
        num_videos_per_prompt=1,
        num_inference_steps=50,
        num_frames=1,
        guidance_scale=6,
        # Same seed for every prompt keeps each sample reproducible.
        generator=torch.Generator(device="cuda").manual_seed(42),
    ).frames[0]

    os.makedirs(frames_save_dir, exist_ok=True)
    # export_to_video(video, "output.mp4", fps=8)
    export_to_video_with_frames(
        video_frames=video,
        output_video_path=os.path.join(vid_save_dir, f"{vid_id}.mp4"),
        output_frames_dir=frames_save_dir,
        fps=8,
        eval_mode=True,
    )

Processing 0th video
Already exists:  /mnt/carpedkm_data/temporal_eval_result/t2v_vanilla/small/video_frames/10184754
Processing 1th video
Already exists:  /mnt/carpedkm_data/temporal_eval_result/t2v_vanilla/small/video_frames/10276206
Processing 2th video
Already exists:  /mnt/carpedkm_data/temporal_eval_result/t2v_vanilla/small/video_frames/10318435
Processing 3th video


100%|██████████| 50/50 [00:15<00:00,  3.22it/s]


Processing 4th video


100%|██████████| 50/50 [00:15<00:00,  3.22it/s]


Processing 5th video


100%|██████████| 50/50 [00:15<00:00,  3.21it/s]


Processing 6th video


100%|██████████| 50/50 [00:15<00:00,  3.21it/s]


Processing 7th video


100%|██████████| 50/50 [00:15<00:00,  3.19it/s]


Processing 8th video


100%|██████████| 50/50 [00:15<00:00,  3.20it/s]


Processing 9th video


 22%|██▏       | 11/50 [00:05<00:18,  2.08it/s]


KeyboardInterrupt: 