<a href="https://colab.research.google.com/github/ahatasham5/image-to-video-/blob/main/ltx_model_i2v.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- ENVIRONMENT SETUP (NO UPSCALING) ---
# Installs the Python packages and system tools, clones ComfyUI with two
# custom-node repositories, and downloads the three model files that
# generate_video() later loads by file name.
!pip install --quiet torch torchvision
!pip install -q torchsde einops diffusers accelerate av
# aria2 provides the `aria2c` downloader used at the end of this cell.
# NOTE(review): ffmpeg is presumably required for MP4 encoding - confirm.
!apt -y install -qq aria2 ffmpeg

%cd /content

# --- Clone ComfyUI & Custom Nodes ---
# ComfyUI is pinned to the ComfyUI_v0.3.34 branch of the Isi-dev fork and the
# GGUF loader to its forHidream branch; ComfyUI_LTXVideo uses the default branch.
!git clone --branch ComfyUI_v0.3.34 https://github.com/Isi-dev/ComfyUI
%cd /content/ComfyUI/custom_nodes
!git clone --branch forHidream https://github.com/Isi-dev/ComfyUI_GGUF.git
!git clone https://github.com/Isi-dev/ComfyUI_LTXVideo

# --- Install Node Requirements ---
# Each custom node ships its own requirements.txt, so pip is run from inside
# each node directory in turn.
%cd /content/ComfyUI/custom_nodes/ComfyUI_GGUF
!pip install -r requirements.txt
%cd /content/ComfyUI/custom_nodes/ComfyUI_LTXVideo
!pip install -r requirements.txt
%cd /content/ComfyUI

# --- Download Models (NO UPSCALERS) ---
# In order: the GGUF diffusion model (models/diffusion_models), the T5-XXL text
# encoder (models/text_encoders) and the VAE (models/vae). The output file
# names must stay in sync with the names passed to the loaders in
# generate_video().
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/wsbagnsv1/ltxv-13b-0.9.7-distilled-GGUF/resolve/main/ltxv-13b-0.9.7-distilled-Q6_K.gguf -d /content/ComfyUI/models/diffusion_models -o ltxv-13b-0.9.7-distilled-Q6_K.gguf
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/wsbagnsv1/ltxv-13b-0.9.7-dev-GGUF/resolve/main/ltxv-13b-0.9.7-vae-BF16.safetensors -d /content/ComfyUI/models/vae -o ltxv-13b-0.9.7-vae-BF16.safetensors


In [None]:
# Make /content/ComfyUI the working directory for the cells below;
# upload_image() expects uploaded files to appear under this directory.
%cd /content/ComfyUI

In [None]:

from IPython.display import clear_output
clear_output()

# --- IMPORTS & PATH SETUP ---
import torch, gc, sys, random, os, imageio
import numpy as np
from PIL import Image
from IPython.display import display, HTML
sys.path.insert(0, '/content/ComfyUI')

from comfy import model_management

from nodes import (
    CheckpointLoaderSimple, CLIPLoader, CLIPTextEncode, VAELoader, VAEDecode, VAEDecodeTiled,
    LoadImage, ImageScale, SaveImage
)

from custom_nodes.ComfyUI_GGUF.nodes import UnetLoaderGGUF
from comfy_extras.nodes_custom_sampler import KSamplerSelect, SamplerCustom, RandomNoise
from comfy_extras.nodes_lt import LTXVPreprocess, LTXVImgToVideo, LTXVScheduler, LTXVConditioning
from custom_nodes.ComfyUI_LTXVideo.stg import STGGuiderAdvancedNode
from custom_nodes.ComfyUI_LTXVideo.easy_samplers import LTXVBaseSampler

# (NO UPSCALER modules imported)

def clear_gpu_memory():
    """Free unreachable Python objects and hand cached CUDA memory back.

    The garbage collector runs first so that tensors which have just become
    unreachable are actually released before the CUDA caching allocator is
    emptied; emptying the cache before collecting would leave their memory
    cached.

    An earlier version also looped over ``globals().values()`` and executed
    ``del obj`` on every tensor-like value. That statement only unbinds the
    loop variable, so it never freed anything, and probing ``obj.data`` on
    arbitrary globals could raise from unrelated properties. The loop has
    been removed; callers that want a global tensor released must ``del``
    the global name themselves before calling this function.

    Returns:
        None.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

def upload_image():
    """Prompt for a file upload in Colab and store it in ComfyUI's input folder.

    Every uploaded file is placed in ``/content/ComfyUI/input``. The file is
    looked up relative to the current working directory rather than a
    hard-coded ``/content/ComfyUI`` path, so the function no longer depends
    on which directory the notebook happens to be in. If the file is not
    found on disk, its content is written from the mapping returned by
    ``files.upload()``.

    Returns:
        The destination path of the first uploaded file, or ``None`` when
        nothing was uploaded.
    """
    from google.colab import files
    import shutil

    input_dir = '/content/ComfyUI/input'
    os.makedirs(input_dir, exist_ok=True)

    uploaded = files.upload()
    first_saved = None
    for filename, content in uploaded.items():
        # NOTE(review): files.upload() is documented to write into the
        # current working directory by default - confirm for the Colab
        # version in use.
        src_path = os.path.abspath(filename)
        dest_path = f'{input_dir}/{filename}'
        if os.path.exists(src_path):
            shutil.move(src_path, dest_path)
        else:
            # Fall back to the in-memory bytes returned by files.upload().
            with open(dest_path, 'wb') as out_file:
                out_file.write(content)
        print(f"Image saved to: {dest_path}")
        if first_saved is None:
            first_saved = dest_path
    return first_saved

def string_to_float(string):
    """Parse a comma-separated string of numbers.

    Returns:
        A one-element tuple whose only item is the list of parsed floats.

    Raises:
        ValueError: if any comma-separated piece is not a valid float.
    """
    pieces = string.split(',')
    values = list(map(float, map(str.strip, pieces)))
    return (values,)

def float_to_sigmas(float_list):
    """Convert a list of floats into a float32 tensor.

    Returns:
        A one-element tuple containing the tensor.
    """
    sigmas = torch.tensor(float_list, dtype=torch.float32)
    return (sigmas,)

def image_width_height(image):
    """Return ``(width, height)`` of a 3-D or 4-D image array/tensor.

    The height and width are read from the two axes that precede the last
    axis, which covers both the 4-D layout (leading extra axis) and the
    3-D layout.

    Raises:
        ValueError: if ``image`` is neither 3-D nor 4-D.
    """
    if image.ndim not in (3, 4):
        raise ValueError(f"Unsupported image shape: {image.shape}")
    dims = image.shape
    height = dims[-3]
    width = dims[-2]
    return width, height

def display_video(video_path):
    """Show an MP4 file inline in the notebook output.

    The file is read fully into memory, base64-encoded and embedded as a
    ``data:`` URL inside an HTML ``<video>`` element that is 512 pixels
    wide.

    Args:
        video_path: Path of the MP4 file to embed.
    """
    from base64 import b64encode
    from IPython.display import HTML, display

    # A context manager closes the handle; it was previously left open.
    with open(video_path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    display(HTML(f'<video width="512" controls><source src="{data_url}" type="video/mp4"></video>'))

def generate_video(
    image_path: str = None,
    positive_prompt: str = "A red fox moving gracefully",
    negative_prompt: str = "low quality, worst quality",
    width: int = 768,
    height: int = 512,
    seed: int = 0,
    steps: int = 30,
    cfg_scale: float = 2.05,
    sampler_name: str = "euler",
    length: int = 24,
    fps: int = 24,
    upscale_video: bool = False # <- will be ignored
):
    """Generate a video from a single conditioning image and display it.

    Pipeline: load the image, encode both prompts with the text encoder
    (which is then released), load the GGUF diffusion model and the VAE,
    sample latents with ``LTXVBaseSampler``, decode them, resize the frames
    back to the source image size and write ``/content/output.mp4``.

    Side effects: writes ``/content/positive_embedding.pt``,
    ``/content/negative_embedding.pt`` (the prompt encodings before
    ``LTXVConditioning`` is applied), ``/content/sample_latents.pt`` and
    ``/content/output.mp4``; shows the video in the notebook output.

    Args:
        image_path: Conditioning image. When ``None`` an upload prompt is
            shown.
        positive_prompt: Text describing the desired video.
        negative_prompt: Text describing what to avoid.
        width: Generation width; must be divisible by 32. When both
            ``width`` and ``height`` are 0, a size is picked from the image
            orientation (768x512, 512x512 or 512x768).
        height: Generation height; must be divisible by 32.
        seed: Seed for the sampling noise.
        steps: Accepted for compatibility but not used; the sigma schedule
            is fixed inside this function.
        cfg_scale: Accepted for compatibility but not used.
        sampler_name: Sampler name passed to ``KSamplerSelect``.
        length: Number of frames passed to the sampler.
        fps: Frame rate of the written MP4.
        upscale_video: Ignored.

    Raises:
        AssertionError: if ``width`` or ``height`` is not divisible by 32.
        ValueError: if no image path was given and nothing was uploaded.
        Exception: any error raised during sampling, decoding or encoding
            is printed and re-raised.
    """
    # Validate sizes before any model is loaded so a bad value fails fast.
    assert width % 32 == 0, "Width must be divisible by 32"
    assert height % 32 == 0, "Height must be divisible by 32"

    with torch.inference_mode():
        # Only the nodes that are actually used are instantiated.
        unet_loader = UnetLoaderGGUF()
        vae_loader = VAELoader()
        clip_loader = CLIPLoader()
        clip_encode_positive = CLIPTextEncode()
        clip_encode_negative = CLIPTextEncode()
        load_image = LoadImage()
        image_scaler = ImageScale()
        sampler_select = KSamplerSelect()
        random_noise = RandomNoise()
        conditioning = LTXVConditioning()
        vae_decode = VAEDecode()
        stg_guider_advanced = STGGuiderAdvancedNode()
        ltxv_base_sampler = LTXVBaseSampler()

        # Resolve the conditioning image first: without one there is no
        # point in loading the text encoder.
        if image_path is None:
            print("Please upload an image file:")
            image_path = upload_image()
        if image_path is None:
            # Previously this only printed a message and then crashed inside
            # load_image(None).
            raise ValueError("No image uploaded!")
        loaded_image = load_image.load_image(image_path)[0]

        # Remember the source size; the decoded frames are resized back to it.
        width_int, height_int = image_width_height(loaded_image)
        if width == 0 and height == 0:
            if width_int > height_int:
                width = 768
                height = 512
            elif width_int == height_int:
                width = 512
                height = 512
            else:
                width = 512
                height = 768

        print("Loading Text_Encoder...")
        clip = clip_loader.load_clip("t5xxl_fp8_e4m3fn_scaled.safetensors", "ltxv", "default")[0]

        positive = clip_encode_positive.encode(clip, positive_prompt)[0]
        negative = clip_encode_negative.encode(clip, negative_prompt)[0]

        # Persist the encodings so a later cell can reuse them without
        # reloading the text encoder.
        torch.save(positive, '/content/positive_embedding.pt')
        torch.save(negative, '/content/negative_embedding.pt')

        # Release the text encoder before the diffusion model is loaded.
        del clip
        torch.cuda.empty_cache()
        gc.collect()

        print("Loading UNet model...")
        model = unet_loader.load_unet("ltxv-13b-0.9.7-distilled-Q6_K.gguf")[0]

        conditionedPositive, conditionedNegative = conditioning.append(positive, negative, 25.0)

        guider = stg_guider_advanced.get_guider(
            model, conditionedPositive, conditionedNegative, 0.997, True,
            "1.0, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180",
            "1,1,1,1,1,1", "0,0,0,0,0,0", "1, 1, 1, 1, 1, 1", "[25], [35], [35], [42], [42], [42]"
        )[0]

        print("Loading VAE...")
        vae = vae_loader.load_vae("ltxv-13b-0.9.7-vae-BF16.safetensors")[0]

        selected_sampler = sampler_select.get_sampler(sampler_name)[0]
        # Fixed sigma schedule; this is why `steps` has no effect.
        sigmas = float_to_sigmas(string_to_float("1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250, 0.4219, 0.0")[0])[0]
        noise = random_noise.get_noise(seed)[0]

        # Rescale to desired size (NO UPSCALING)
        loaded_image = image_scaler.upscale(loaded_image, "lanczos", width, height, "disabled")[0]

        try:
            print("Generating video...")
            sampled = ltxv_base_sampler.sample(
                model, vae, width, height, length, guider, selected_sampler,
                sigmas, noise, optional_cond_images=loaded_image, optional_cond_indices="0",
                strength=0.8, crop="disabled", crf=30, blur=1
            )[0]
            torch.save(sampled, '/content/sample_latents.pt')
            print("Latent saved as /content/sample_latents.pt")
            del model, guider, noise
            torch.cuda.empty_cache(); gc.collect()

            print("Decoding latents...")
            decoded = vae_decode.decode(vae, sampled)[0].detach()
            del vae, sampled
            torch.cuda.empty_cache(); gc.collect()

            # Rescale to original image size (NO UPSCALING)
            decoded = image_scaler.upscale(decoded, "lanczos", width_int, height_int, "disabled")[0]

            output_path = "/content/output.mp4"
            # Defensive clamp: casting out-of-range floats to uint8 would
            # wrap around instead of saturating.
            frames_np = (decoded.clamp(0.0, 1.0).cpu().numpy() * 255).astype(np.uint8)
            del decoded
            torch.cuda.empty_cache(); gc.collect()

            with imageio.get_writer(output_path, fps=fps) as writer:
                for frame in frames_np:
                    writer.append_data(frame)

            print(f"\nVideo generation complete! Displaying Video...")
            display_video(output_path)

        except Exception as e:
            print(f"Error during video generation: {str(e)}")
            raise
        finally:
            clear_gpu_memory()

print("✅ Environment Setup Complete!")


In [None]:
# Ask for an image and remember where it was stored; later cells pass this
# path to generate_video().
file_uploaded = upload_image()

# Flip to True to preview the uploaded image in the notebook output.
display_upload = False
if display_upload:
    from IPython.display import Image as IPImage, display
    previewable_suffixes = ('.png', '.jpg', '.jpeg')
    can_preview = bool(file_uploaded) and file_uploaded.lower().endswith(previewable_suffixes)
    if not can_preview:
        print("The image format cannot be displayed.")
    else:
        display(IPImage(filename=file_uploaded))

In [None]:
# Prompt crafted for your scene
positive_prompt = (
    "An anime-style cinematic scene of a lone young man with dark hair and a travel bag "
    "walking towards the grand gates of a Japanese academy, golden sunlight streaming through the trees, "
    "detailed background, hopeful and dramatic atmosphere, soft clouds in the sky, highly detailed, anime art"
)

negative_prompt = "low quality, blurry, poorly drawn, deformed, low resolution, bad anatomy"

# Generate a short clip from the image uploaded in the previous cell.
generate_video(
    image_path=file_uploaded,
    positive_prompt=positive_prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    # NOTE(review): the other cells use lengths of the form 8*k + 1 (97, 193);
    # confirm whether 24 frames is a valid length for the sampler.
    length=24,
    fps=24,
    seed=42,
    steps=30,  # accepted but not read by generate_video (its sigma schedule is fixed)
    upscale_video=False  # ignored by generate_video; no upscaling is performed
)

In [None]:
# Second prompt, reusing the image uploaded earlier (file_uploaded).
positive_prompt = (
    "cinematic wide shot, a large gorilla enters the sunlit scene and warmly hugs a surprised young man, "
    "soft dynamic shadows, emotional anime style, detailed background, flowing camera movement, "
    "expressive lighting, dramatic moment, fluid animation, subtle particles in air"
)

negative_prompt = (
    "low quality, blurry, poorly drawn, deformed, bad anatomy, disfigured, motion smear, "
    "motion artifacts, fused fingers, weird hands, ugly"
)

# Set for 8 seconds of animation (24 fps x 8 + 1 = 193 frames)
generate_video(
    image_path=file_uploaded,
    positive_prompt=positive_prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    length=193,  # <- longer video for visible animation!
    fps=24,
    seed=42,
    steps=30,  # accepted but not read by generate_video (its sigma schedule is fixed)
    upscale_video=False  # ignored by generate_video
)


In [None]:
# Upload a separate image for this run; its saved path is kept in `suaib`.
suaib = upload_image()

#file_uploaded = "/content/ComfyUI/e8521f2e-49b0-42bd-9270-b705a6d763c5.jfif"  # your uploaded image path

positive_prompt = (
    "two young men standing close together, smiling warmly and looking at each other with romantic look, "
    "friendly atmosphere, candid selfie moment, Happy together, "
    "well-groomed, natural expressions, clean background, sharp focus, realistic style, positive mood"
)

negative_prompt = (
    "low quality, blurry, awkward pose, looking away, distorted faces, bad anatomy, poorly drawn hands, overexposed, cartoon, illustration"
)
generate_video(
    image_path=suaib,
    positive_prompt=positive_prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=768,         # square to fit your image shape
    length=97,          # 97 frames is ~4 seconds at 24 fps (use 49 for ~2 seconds)
    fps=24,
    seed=77,            # any number you like
    steps=30,           # accepted but not read by generate_video
    upscale_video=False # ignored by generate_video
)


For upscaling
# New Section

In [None]:
# For upscaling: make sure the sampled latents are available on disk.
#
# generate_video() already writes its latents to this path. `sampled` is a
# local variable of that function, so referencing it unconditionally here
# raised NameError; it is only saved again when the session actually defines
# a global with that name.
import os
import torch

latents_path = '/content/sample_latents.pt'
if 'sampled' in globals():
    torch.save(sampled, latents_path)
    print(f"Latents saved to {latents_path}")
elif os.path.exists(latents_path):
    print(f"Using latents already saved at {latents_path}")
else:
    raise FileNotFoundError(
        f"{latents_path} not found - run generate_video() first or upload the latent file."
    )


In [None]:
# Stand-alone setup for the upscaling workflow. It repeats the installs and
# clones of the first setup cell so it can run in a fresh session, and
# downloads the spatial upscaler in addition to the diffusion model and VAE.
# The text encoder is not downloaded here.

# -- Install dependencies --
!pip install --quiet torch torchvision torchsde einops diffusers accelerate av

# -- System packages --
!apt -y install -qq aria2 ffmpeg

# -- Clone ComfyUI and custom nodes --
%cd /content
!git clone --branch ComfyUI_v0.3.34 https://github.com/Isi-dev/ComfyUI
%cd /content/ComfyUI/custom_nodes
!git clone --branch forHidream https://github.com/Isi-dev/ComfyUI_GGUF.git
!git clone https://github.com/Isi-dev/ComfyUI_LTXVideo

# -- Install requirements for nodes --
%cd /content/ComfyUI/custom_nodes/ComfyUI_GGUF
!pip install -r requirements.txt
%cd /content/ComfyUI/custom_nodes/ComfyUI_LTXVideo
!pip install -r requirements.txt
%cd /content/ComfyUI

# -- Download models (edit these if you have your own paths) --
# In order: GGUF diffusion model, VAE, spatial latent upscaler.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/wsbagnsv1/ltxv-13b-0.9.7-distilled-GGUF/resolve/main/ltxv-13b-0.9.7-distilled-Q6_K.gguf -d /content/ComfyUI/models/diffusion_models -o ltxv-13b-0.9.7-distilled-Q6_K.gguf
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/wsbagnsv1/ltxv-13b-0.9.7-dev-GGUF/resolve/main/ltxv-13b-0.9.7-vae-BF16.safetensors -d /content/ComfyUI/models/vae -o ltxv-13b-0.9.7-vae-BF16.safetensors
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Lightricks/LTX-Video/resolve/main/ltxv-spatial-upscaler-0.9.7.safetensors -d /content/ComfyUI/models/upscale_models -o ltxv-spatial-upscaler-0.9.7.safetensors

# Hide the long install/download log once everything has finished.
from IPython.display import clear_output
clear_output()

# Now upload your latent file if you haven't already (e.g., via Colab file upload UI)


In [None]:
# Downloads only the spatial upscaler; this is the same command as the last
# download of the upscaling setup cell, for sessions where that cell was skipped.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Lightricks/LTX-Video/resolve/main/ltxv-spatial-upscaler-0.9.7.safetensors -d /content/ComfyUI/models/upscale_models -o ltxv-spatial-upscaler-0.9.7.safetensors

In [None]:
# Upscaling pipeline: load previously saved latents, upsample them with the
# LTXV latent upsampler, re-sample with the tiled sampler, decode with the
# tiled VAE decoder, resize to 1080x1080 and write /content/upscaled.mp4.
import torch
import gc
from comfy import model_management

# Re-import all needed upscaler and utility modules
from custom_nodes.ComfyUI_GGUF.nodes import UnetLoaderGGUF
from nodes import VAELoader, VAEDecodeTiled, ImageScale
from comfy_extras.nodes_custom_sampler import RandomNoise
from custom_nodes.ComfyUI_LTXVideo.stg import STGGuiderAdvancedNode
from custom_nodes.ComfyUI_LTXVideo.latent_upsampler import (
    LTXVLatentUpsamplerModelLoader, LTXVLatentUpsampler
)
from custom_nodes.ComfyUI_LTXVideo.latent_adain import LTXVAdainLatent
from custom_nodes.ComfyUI_LTXVideo.tiled_sampler import LTXVTiledSampler

# --- Load your sampled latents ---
# This is the file written by generate_video(). torch.load unpickles its
# input, so only load files produced by this notebook.
latents = torch.load('/content/sample_latents.pt', map_location='cuda')  # or 'cpu' if you don't have a GPU

# --- Model and upscaler loading (paths must match your session/setup) ---
unet_loader = UnetLoaderGGUF()
vae_loader = VAELoader()
vae_decode_tiled = VAEDecodeTiled()
image_scaler = ImageScale()
upscale_model_loader = LTXVLatentUpsamplerModelLoader()
latent_upsampler = LTXVLatentUpsampler()
adain_latent = LTXVAdainLatent()
tiled_sampler = LTXVTiledSampler()
random_noise = RandomNoise()
stg_guider_advanced = STGGuiderAdvancedNode()

# Load UNet, VAE, Upscale models (use the same model names as in the original code)
model = unet_loader.load_unet("ltxv-13b-0.9.7-distilled-Q6_K.gguf")[0]
vae = vae_loader.load_vae("ltxv-13b-0.9.7-vae-BF16.safetensors")[0]
upscale_model = upscale_model_loader.load_model(
    "ltxv-spatial-upscaler-0.9.7.safetensors", True, False
)[0]

# Placeholder conditioning values. They are dead code: both names are
# overwritten by the torch.load calls immediately below.
conditionedPositive = torch.zeros((1,))  # <-- Replace with your original values!
conditionedNegative = torch.zeros((1,))

# NOTE(review): generate_video() saves these files before
# LTXVConditioning.append() is applied, so despite the variable names they
# hold the raw prompt encodings - confirm whether the frame-rate conditioning
# has to be re-applied here.
conditionedPositive = torch.load('/content/positive_embedding.pt', map_location='cuda')
conditionedNegative = torch.load('/content/negative_embedding.pt', map_location='cuda')


# Build the guider for the tiled pass. It takes the same kind of arguments as
# the guider in generate_video(), but with a single entry per list.
tiled_guider = stg_guider_advanced.get_guider(
    model, conditionedPositive, conditionedNegative,
    0.997, True, "1", "1", "0", "1", "[42]"
)[0]

import numpy as np
# The two helpers below duplicate the definitions from the generation cell so
# that this cell can run in a fresh session.
def string_to_float(string):
    # Parse "a, b, c" into a one-element tuple holding the list of floats.
    float_list = [float(x.strip()) for x in string.split(',')]
    return (float_list,)

def float_to_sigmas(float_list):
    # Wrap the values in a float32 tensor, returned as a one-element tuple.
    return torch.tensor(float_list, dtype=torch.float32),

# Sigma schedule for the tiled refinement pass (five values).
tiled_sigmas = float_to_sigmas(
    string_to_float("0.85, 0.7250, 0.6, 0.4219, 0.0")[0]
)[0]

# --- Actual upscaling pipeline (copied from your original code) ---
upscaled_latents = latent_upsampler.upsample_latent(
    latents, upscale_model, vae
)[0]

# NOTE(review): presumably matches the upscaled latents' statistics to the
# original latents with factor 0.25 - confirm against LTXVAdainLatent.
adjusted_latents = adain_latent.batch_normalize(
    upscaled_latents, latents, 0.25
)[0]

# Release models as needed
del latents, upscale_model, upscaled_latents
gc.collect(); torch.cuda.empty_cache()

# Generate noise for tiling
seed = 77  # use same seed as before!
tiled_noise = random_noise.get_noise(seed)[0]

# Specify your width/height/etc. as needed, or restore from metadata
# NOTE(review): the generation cells use 768x512 or 768x768, not 512x512 -
# set these to match the run that produced the latents.
width = 512
height = 512

# The `loaded_image` should match the conditions (may use a dummy/zeros if not using cond images)
# NOTE(review): this is an all-zero image, not the picture used during
# generation - confirm that conditioning on it is intended.
loaded_image = torch.zeros((1, height, width, 3), dtype=torch.float32) # adjust as needed

# Tiled sampling
# NOTE(review): `sampler=None` is a placeholder; presumably a sampler object is
# required here, such as the one generate_video() obtains from
# KSamplerSelect().get_sampler("euler")[0] - confirm and supply it.
tiled_output, _ = tiled_sampler.sample(
    model=model,
    vae=vae,
    noise=tiled_noise,
    sampler=None,  # supply your sampler
    sigmas=tiled_sigmas,
    guider=tiled_guider,
    latents=adjusted_latents,
    optional_cond_images=loaded_image,
    horizontal_tiles=1,
    vertical_tiles=1,
    overlap=1,
    latents_cond_strength=0.15,
    boost_latent_similarity=False,
    crop="disabled",
    optional_cond_indices="0",
    images_cond_strengths="0.9"
)



# --- After tiled_output ---
upscaled_latents = tiled_output["samples"]
latent_input = { "samples": upscaled_latents }
# NOTE(review): `width` is passed as the third positional argument, presumably
# the tile size - confirm against the VAEDecodeTiled.decode signature.
decoded_frames = vae_decode_tiled.decode(vae, latent_input, width, 64, 64, 8)[0]

# Upscale to 1080p square (or 1920, 1080 for landscape)
# The target is 1080x1080 whatever the source aspect ratio is.
decoded_frames = image_scaler.upscale(decoded_frames, "lanczos", 1080, 1080, "disabled")[0]

import imageio
output_pathU = "/content/upscaled.mp4"
# NOTE(review): values are not clamped to [0, 1] before the uint8 cast -
# confirm the decoded frames are already in range.
frames_npu = (decoded_frames.cpu().numpy() * 255).astype(np.uint8)
# NOTE(review): the generation cells write at 24 fps; 12 fps here plays the
# same frames at half speed - confirm this is intended.
with imageio.get_writer(output_pathU, fps=12) as writer:  # Set FPS as you want
    for frame in frames_npu:
        writer.append_data(frame)

print("Upscaled video saved to:", output_pathU)
