In [None]:
# Switch the working directory to the ComfyUI checkout; presumably this is what
# lets the `execution`, `server` and `nodes` imports below resolve — TODO confirm.
%cd /content/ComfyUI

import json
import os
import random
import shutil
import time
from urllib.parse import urlsplit

import numpy as np
import requests
import torch
from PIL import Image

import asyncio
import execution
import server
# Minimal ComfyUI bootstrap: a dedicated asyncio event loop, a PromptServer
# bound to that loop, and a PromptQueue attached to the server. Nothing is
# served or queued in this notebook; presumably the custom nodes loaded later
# expect these objects to exist when they are imported — TODO confirm.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
server_instance = server.PromptServer(loop)
# Hand the queue the PromptServer *instance*, not the `server` module: ComfyUI's
# own startup code builds the queue from the server object, and the module has
# none of the instance's methods.
execution.PromptQueue(server_instance)

from nodes import load_custom_node
from nodes import NODE_CLASS_MAPPINGS

# Load the three custom node packs, in this order, so that their node classes
# can be looked up in NODE_CLASS_MAPPINGS afterwards.
for _custom_node_dir in (
    "/content/ComfyUI/custom_nodes/ComfyUI-CogVideoXWrapper",
    "/content/ComfyUI/custom_nodes/ComfyUI-VideoHelperSuite",
    "/content/ComfyUI/custom_nodes/ComfyUI-KJNodes",
):
    load_custom_node(_custom_node_dir)

# Create one instance of every node class the pipeline uses. Each global is
# named after its NODE_CLASS_MAPPINGS key; the targets on the left and the keys
# on the right are listed in the same order and must stay aligned.
(
    LoadImage,
    ImageResizeKJ,
    CogVideoImageEncode,
    CogVideoLoraSelect,
    DownloadAndLoadCogVideoModel,
    CogVideoTextEncode,
    CLIPLoader,
    CogVideoSampler,
    CogVideoDecode,
    VHS_VideoCombine,
) = [
    NODE_CLASS_MAPPINGS[node_name]()
    for node_name in (
        "LoadImage",
        "ImageResizeKJ",
        "CogVideoImageEncode",
        "CogVideoLoraSelect",
        "DownloadAndLoadCogVideoModel",
        "CogVideoTextEncode",
        "CLIPLoader",
        "CogVideoSampler",
        "CogVideoDecode",
        "VHS_VideoCombine",
    )
]

# Load the models once at module level; `generate` reuses `pipeline` and `clip`
# on every call instead of reloading them.
with torch.inference_mode():
    # LoRA selection that is handed to the model loader below.
    lora = CogVideoLoraSelect.getlorapath(
        "orbit_up_lora_weights.safetensors",
        1.0,
        fuse_lora=True,
    )[0]
    # CogVideoX image-to-video pipeline with the LoRA applied.
    pipeline = DownloadAndLoadCogVideoModel.loadmodel(
        "THUDM/CogVideoX-5b-I2V",
        "bf16",
        fp8_transformer="disabled",
        compile="disabled",
        enable_sequential_cpu_offload=False,
        lora=lora,
    )[0]
    # Text encoder used for the positive and negative prompts.
    clip = CLIPLoader.load_clip(
        "t5xxl_fp16.safetensors",
        type="sd3",
    )[0]

def download_file(url, save_dir, file_name, timeout=60):
    """Download ``url`` into ``save_dir`` and return the path of the saved file.

    The file is stored as ``file_name`` plus the extension found in the URL's
    path component (query string and fragment are ignored); when the URL path
    has no extension the file is saved without one.

    Args:
        url: HTTP(S) address of the file to fetch.
        save_dir: Target directory; created if it does not exist.
        file_name: Base name of the saved file, without extension.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled host.

    Returns:
        Path of the written file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within ``timeout``.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Take the extension from the URL *path* so "?query" parts do not leak in.
    file_suffix = os.path.splitext(urlsplit(url).path)[1]
    file_path = os.path.join(save_dir, file_name + file_suffix)
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk when the server reports an error.
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)
    return file_path

@torch.inference_mode()
def generate(input):
    """Render an image-to-video clip with CogVideoX for a single request.

    Args:
        input: Mapping with an ``"input"`` entry that holds the request values:
            ``input_image_check`` (URL of the conditioning image), ``prompt``,
            ``negative_prompt``, ``seed`` (``0`` means "pick a random seed"),
            ``steps`` and ``cfg``.

    Returns:
        Path of the resulting MP4. Every call moves its result to the same
        fixed destination path.

    Raises:
        KeyError: if ``"input"`` or one of the request values is missing.
        requests.HTTPError: if the conditioning image cannot be downloaded.
    """
    values = input["input"]

    image_url = values['input_image_check']
    image_path = download_file(url=image_url, save_dir='/content/ComfyUI/input', file_name='input_image')
    prompt = values['prompt']
    negative_prompt = values['negative_prompt']
    seed = values['seed']
    steps = values['steps']
    cfg = values['cfg']

    if seed == 0:
        # Use the OS entropy source: seeding the global RNG from the wall clock
        # gave identical seeds to calls made within the same second and
        # clobbered the shared `random` state for everything else.
        seed = random.SystemRandom().randint(0, 18446744073709551615)

    positive = CogVideoTextEncode.process(clip, prompt, strength=1.0, force_offload=True)[0]
    negative = CogVideoTextEncode.process(clip, negative_prompt, strength=1.0, force_offload=True)[0]

    # The second return value of load_image is not used here.
    image, _ = LoadImage.load_image(image_path)
    # Resize to the same 720x480 frame size that is passed to the sampler.
    image = ImageResizeKJ.resize(image, width=720, height=480, keep_proportion=False, upscale_method="lanczos", divisible_by=16, crop="center")[0]
    image_cond_latents = CogVideoImageEncode.encode(pipeline, image, chunk_size=16, enable_tiling=True)[0]
    samples = CogVideoSampler.process(pipeline, positive, negative, steps, cfg, seed, height=480, width=720, num_frames=49, scheduler="CogVideoXDPMScheduler", denoise_strength=1.0, image_cond_latents=image_cond_latents)
    frames = CogVideoDecode.decode(samples[0], samples[1], enable_vae_tiling=True, tile_sample_min_height=240, tile_sample_min_width=360, tile_overlap_factor_height=0.2, tile_overlap_factor_width=0.2, auto_tile_size=True)[0]

    out_video = VHS_VideoCombine.combine_video(images=frames, frame_rate=8, loop_count=0, filename_prefix="CogVideoX-I2V", format="video/h264-mp4", save_output=True)
    # NOTE(review): assumes the second saved file of the first result entry is
    # the encoded video — confirm against VHS_VideoCombine's return structure.
    source = out_video["result"][0][1][1]
    destination = '/content/ComfyUI/output/cogvideox-5b-i2v-dimensionx-tost.mp4'
    # `shutil` must be imported at the top of the notebook for this move.
    shutil.move(source, destination)

    return destination

In [None]:
# Example request in the shape `generate` expects: every setting sits under the
# top-level "input" key. A seed of 0 makes `generate` pick a random seed.
input = dict(
    input=dict(
        input_image_check="https://files.catbox.moe/flj6kn.png",
        prompt="Cinematic scene, camera orbits.",
        negative_prompt=(
            "The video is not of a high quality, it has a low resolution. "
            "Watermark present in each frame. Strange motion trajectory. "
        ),
        seed=0,
        steps=10,
        cfg=6.0,
    )
)
# Despite its name, `image` holds the path of the generated MP4; the bare
# expression on the last line makes the notebook display it.
image = generate(input)
image