In [1]:
import torch
from diffusers import DiffusionPipeline
from PIL import Image
from deep_translator import GoogleTranslator
import numpy as np
import cv2
import os
import gc
from tqdm import tqdm

# ============================== CONFIG ==============================

# Model identifier passed to DiffusionPipeline.from_pretrained() in main().
MODEL_ID = "damo-vilab/text-to-video-ms-1.7b"
# Number of frames requested from the pipeline (num_frames argument).
NUM_FRAMES = 96
# Playback rate handed to cv2.VideoWriter; with NUM_FRAMES this gives a
# 96 / 8 = 12 second clip.
FPS = 8
# Passed to the pipeline as guidance_scale.
GUIDANCE_SCALE = 10.0
# Passed to the pipeline as num_inference_steps.
INFERENCE_STEPS = 25
# Path of the MP4 file written by main() (relative to the working directory).
OUTPUT_FILENAME = "surfing.mp4"
# Language code the prompt is translated into before inference.
TARGET_LANGUAGE = 'en'

# ============================ UTILITIES =============================

def safe_image_list_from_array(arr):
    """Convert a numpy image array into a list of RGB PIL Images.

    Args:
        arr: Array-like of shape ``(H, W, 3)`` (a single frame) or
            ``(N, H, W, 3)`` (a stack of N frames).

            * ``uint8`` data is used as-is.
            * Other integer dtypes are treated as already being on the
              0-255 scale and are only clipped (not rescaled).
            * Everything else (floats, bools) is treated as normalized to
              ``[0, 1]`` and is multiplied by 255 before clipping.

    Returns:
        A list of PIL Images in RGB mode, one per frame. An empty stack
        (``N == 0``) yields an empty list.

    Raises:
        ValueError: If ``arr`` is not a 3-channel array of rank 3 or 4.
    """
    arr = np.asarray(arr)

    if arr.ndim not in (3, 4) or arr.shape[-1] != 3:
        raise ValueError(f"Frame shape not suitable for an image: {arr.shape}")

    # Treat a single frame as a stack of one so both layouts share one path.
    if arr.ndim == 3:
        arr = arr[np.newaxis]

    # Decide once how non-uint8 data is mapped to 0-255: integer arrays are
    # already on that scale, so multiplying by 255 would saturate them.
    already_uint8 = arr.dtype == np.uint8
    is_integer = np.issubdtype(arr.dtype, np.integer)

    frames = []
    # Convert frame by frame to keep the temporary float copies small.
    for frame in arr:
        if not already_uint8:
            if not is_integer:
                frame = frame * 255
            frame = frame.clip(0, 255).astype(np.uint8)
        img = Image.fromarray(frame)
        if img.mode != "RGB":
            img = img.convert("RGB")
        frames.append(img)
    return frames

# ============================== MAIN ===============================

def main():
    """Generate a video from a text prompt and save it as an MP4 file.

    The prompt is translated to TARGET_LANGUAGE, the MODEL_ID pipeline is
    loaded in fp16 on CUDA and run for NUM_FRAMES frames, the resulting
    frames are converted to RGB PIL images, and the video is written to
    OUTPUT_FILENAME with OpenCV at FPS frames per second.

    Any exception is caught and reported on stdout instead of being raised.
    The CUDA cache is emptied and the garbage collector is run on exit in
    all cases.
    """
    prompt_text = "Tampilkan video dua ekor kucing sedang bermain di taman"

    try:
        # Translate prompt to English
        prompt = GoogleTranslator(source='auto', target=TARGET_LANGUAGE).translate(prompt_text)
        print(f"Translated prompt: '{prompt_text}' → '{prompt}'")

        # Load model
        pipe = DiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.float16, variant="fp16").to("cuda")

        # Run inference
        with torch.inference_mode():
            result = pipe(prompt=prompt,
                          num_inference_steps=INFERENCE_STEPS,
                          num_frames=NUM_FRAMES,
                          guidance_scale=GUIDANCE_SCALE)

        video_frames = result["frames"]

        # Convert all frames to RGB PIL images. Each item may be a numpy
        # array (a single frame or a stack of frames) or a PIL image.
        frames_rgb = []
        for frame in tqdm(video_frames, desc="Converting frames..."):
            if isinstance(frame, np.ndarray):
                frames_rgb.extend(safe_image_list_from_array(frame))
            else:
                if frame.mode != "RGB":
                    frame = frame.convert("RGB")
                frames_rgb.append(frame)

        # Drop every reference to the pipeline and its output so that
        # empty_cache() can actually hand the model's GPU memory back.
        del result, video_frames, pipe
        torch.cuda.empty_cache()
        gc.collect()

        if not frames_rgb:
            raise RuntimeError("The pipeline returned no frames.")

        # Get dimensions from the first frame (PIL reports (width, height))
        frame_size = frames_rgb[0].size

        # Resize only the frames that differ from the first one, so every
        # frame matches the size the writer is opened with.
        frames_rgb = [
            frame if frame.size == frame_size else frame.resize(frame_size, Image.BICUBIC)
            for frame in tqdm(frames_rgb, desc="Resizing frames...")
        ]

        # Save video using OpenCV
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(OUTPUT_FILENAME, fourcc, FPS, frame_size)
        try:
            # VideoWriter does not raise when it cannot open the file or
            # codec; without this check write() would silently do nothing.
            if not out.isOpened():
                raise RuntimeError(f"Could not open video writer for: {OUTPUT_FILENAME}")

            for frame in tqdm(frames_rgb, desc="Writing video with OpenCV..."):
                frame_bgr = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)
        finally:
            # Always finalize the container, even if a write fails midway.
            out.release()

        print(f"\n✅ Video saved at: {os.path.abspath(OUTPUT_FILENAME)}")

    except Exception as e:
        print(f"❌ Error: {e}")

    finally:
        torch.cuda.empty_cache()
        gc.collect()

# ============================== ENTRY ==============================

# Run the generation only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()

Translated prompt: 'Tampilkan video dua ekor kucing sedang bermain di taman' → 'Show videos of two cats playing in the park'


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. 


  0%|          | 0/25 [00:00<?, ?it/s]

Converting frames...: 100%|██████████| 1/1 [00:00<00:00,  8.65it/s]
Resizing frames...: 100%|██████████| 96/96 [00:00<00:00, 8253.63it/s]
Writing video with OpenCV...: 100%|██████████| 96/96 [00:00<00:00, 1127.37it/s]



✅ Video saved at: /home/alif_ahmad/work/Script/surfing.mp4
