In [12]:
import os
import torch
import numpy as np
from diffusers import AutoencoderKLCogVideoX
import av

def decode_latents_to_video(latent_path, vae_model_path, output_video_path, height, width, fps=8, debug=False):
    """
    Decode VAE latents back into a video and save it as an .mp4 file.

    Args:
        latent_path: Path to a ``.npy`` file containing the latents to decode.
        vae_model_path: Model id or directory handed to
            ``AutoencoderKLCogVideoX.from_pretrained`` (its ``vae`` subfolder is loaded).
        output_video_path: Destination path of the encoded video.
        height: Height in pixels of the written video.
        width: Width in pixels of the written video.
        fps: Frame rate of the written video.
        debug: If True, show up to the first five frames with matplotlib.

    Notes:
        * Decoded frames are treated as RGB and written as ``rgb24`` without any
          channel reordering.
        * Decoder values outside [-1, 1] are clamped before the uint8 conversion
          so they saturate instead of wrapping around.
        * Frames are resized bilinearly only when the decoded size differs from
          ``(height, width)``.
    """
    # Prefer the GPU, but fall back to the CPU so the function still works without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the latents
    latents = torch.from_numpy(np.load(latent_path)).to(torch.float32)

    # Load the VAE model
    vae = AutoencoderKLCogVideoX.from_pretrained(vae_model_path, subfolder="vae")
    vae.eval()
    vae.to(device)

    # Decode latents
    with torch.no_grad():
        decoded_output = vae.decode(latents.to(device) / vae.config.scaling_factor)
        frames = decoded_output.sample  # Extract frames from DecoderOutput

    # Debug: Check tensor shape
    print(f"Decoded tensor shape: {frames.shape}")

    # Drop the batch dimension and put frames first: [B, C, F, H, W] -> [F, C, H, W]
    frames = frames.squeeze(0).permute(1, 0, 2, 3).float()

    # Resize in float space (before quantisation) only if the decoder output
    # does not already match the requested video size.
    if tuple(frames.shape[-2:]) != (height, width):
        frames = torch.nn.functional.interpolate(
            frames, size=(height, width), mode="bilinear", align_corners=False
        )

    # Map [-1, 1] -> [0, 255]; clamp first so overshoot saturates rather than
    # wrapping when cast to uint8, and round instead of truncating.
    frames = ((frames.clamp(-1.0, 1.0) + 1.0) / 2.0 * 255.0).round().to(torch.uint8)

    # [F, C, H, W] -> [F, H, W, C], made contiguous so every frame is a packed RGB image
    frames = frames.permute(0, 2, 3, 1).contiguous().cpu().numpy()

    # Debug: Display first few frames
    if debug:
        import matplotlib.pyplot as plt
        for i in range(min(5, frames.shape[0])):
            plt.imshow(frames[i])
            plt.title(f"Frame {i}")
            plt.axis("off")
            plt.show()

    # Use PyAV for video writing; the context manager closes the container
    # even if encoding fails part-way through.
    with av.open(output_video_path, mode="w") as container:
        stream = container.add_stream("h264", rate=fps)
        stream.width = width
        stream.height = height
        stream.pix_fmt = "yuv420p"

        for frame in frames:
            frame_av = av.VideoFrame.from_ndarray(frame, format="rgb24")
            for packet in stream.encode(frame_av):
                container.mux(packet)

        # Flush any frames still buffered inside the encoder
        for packet in stream.encode(None):
            container.mux(packet)

    print(f"Decoded video saved at {output_video_path}")


In [13]:
# Example usage: decode one stored latent file and write the result as an mp4
latent_path = "/mnt/carpedkm_data/pexels_8fps_latents_1600/1002790_vae_latents.npy"
vae_model_path = "THUDM/CogVideoX-5b"
output_video_path = "./output_video_check_decode.mp4"
height, width, fps = 480, 720, 8  # output resolution (pixels) and frame rate

decode_latents_to_video(
    latent_path=latent_path,
    vae_model_path=vae_model_path,
    output_video_path=output_video_path,
    height=height,
    width=width,
    fps=fps,
    debug=False,
)

The config attributes {'invert_scale_latents': False} were passed to AutoencoderKLCogVideoX, but are not expected and will be ignored. Please verify your config.json configuration file.


Decoded tensor shape: torch.Size([1, 3, 49, 480, 720])
Decoded video saved at ./output_video_check_decode.mp4
