In [2]:
pip install torch diffusers transformers accelerate

Collecting diffusers
  Downloading diffusers-0.34.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub>=0.27.0 (from diffusers)
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from diffusers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.3.1 (from diffusers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub>=0.27.0->diffusers)
  Downloading hf_xet-1.1.5-cp37-abi3-manylinux_2_17_x86_64.manylinux201

In [5]:
pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (19 kB)
Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.12.0.88
Note: you may need to restart the kernel to use updated packages.


In [14]:
import torch
from diffusers import DiffusionPipeline
from PIL import Image
from IPython.display import Video
from deep_translator import GoogleTranslator
import numpy as np
import cv2
import os
import gc
import imageio
from tqdm import tqdm

def safe_image_list_from_array(arr):
    """Convert an RGB array (single image or batch) into a list of PIL images.

    Args:
        arr: Array-like whose last axis has length 3. A 3-D input is treated
            as one image, a 4-D input as a stack of images along axis 0.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per image in the input.
        Data that is not already ``uint8`` is multiplied by 255, clipped to
        [0, 255] and cast to ``uint8`` before conversion.

    Raises:
        ValueError: If the input is not 3-D/4-D with a trailing axis of 3.
    """
    pixels = np.asarray(arr)

    # Guard clause: only (H, W, 3) and (N, H, W, 3) layouts are accepted.
    if pixels.ndim not in (3, 4) or pixels.shape[-1] != 3:
        raise ValueError(f"Frame shape not suitable for an image: {pixels.shape}")

    def as_uint8(plane):
        # uint8 data is passed through untouched; anything else is rescaled.
        if plane.dtype == np.uint8:
            return plane
        return (plane * 255).clip(0, 255).astype('uint8')

    # Treat a single image as a batch of one so both layouts share one path.
    stack = pixels if pixels.ndim == 4 else pixels[np.newaxis]
    return [Image.fromarray(as_uint8(plane)) for plane in stack]

# Translate the (Indonesian) prompt to English before handing it to the model;
# the source language is auto-detected by the translator.
translator = GoogleTranslator(source='auto', target='en')
prompt = translator.translate("Tampilkan video dua ekor kucing sedang bermain di taman")
print(f" Cleaned prompt: '{prompt}'")

# Load the fp16 weights of the text-to-video checkpoint, then move the pipeline to the GPU.
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe = pipe.to("cuda")

# Seed the sampler so that re-running the notebook can reproduce the same clip
# (on the same hardware/software stack).
SEED = 0
generator = torch.Generator(device="cuda").manual_seed(SEED)

try:
    # inference_mode disables autograd bookkeeping for the whole sampling loop.
    with torch.inference_mode():
        result = pipe(
            prompt=prompt,
            num_inference_steps=25,
            num_frames=96,
            guidance_scale=10.0,
            generator=generator,
        )
except Exception as e:
    print("Generation failed:", e)
    # Collect garbage first so the CUDA caching allocator has freed blocks to release.
    gc.collect()
    torch.cuda.empty_cache()
    # Re-raise: continuing would either hit a NameError on `result` or, worse,
    # silently reuse a stale `result` left in the kernel by an earlier run.
    raise

# May be a list of PIL images, a list of arrays, or a batched array depending on
# the pipeline's output type; the conversion step below handles the array cases.
video_frames = result["frames"]

# Flatten all frames to a list of PIL.Image in RGB mode.
# Note: the progress bar counts top-level entries of `video_frames` (one entry
# may be a whole batch of frames), not individual frames.
frames_rgb = []
for frame in tqdm(video_frames, total=len(video_frames), desc="Converting frames..."):
    # ndarray entries may hold a single image or a batch; anything else is
    # assumed to already be a PIL image.
    imgs = safe_image_list_from_array(frame) if isinstance(frame, np.ndarray) else [frame]
    frames_rgb.extend(img if img.mode == "RGB" else img.convert("RGB") for img in imgs)

# Fail with a clear message instead of an IndexError on frames_rgb[0] below.
if not frames_rgb:
    raise RuntimeError("The pipeline returned no frames; there is nothing to encode.")

# Get dimensions from first frame (array shape is height, width, channels)
first_frame = np.array(frames_rgb[0])
height, width, _ = first_frame.shape
frame_size = (width, height)

# Make every frame exactly the size of the first one; the video writer is opened
# with a single fixed frame size. Frames that already match are kept as they are.
frames_rgb = [
    frame if frame.size == frame_size else frame.resize(frame_size, Image.BICUBIC)
    for frame in tqdm(frames_rgb, desc="Resizing frames...")
]

fps = 8
# NOTE(review): the file name does not match the prompt's subject; kept unchanged
# so anything that expects this path keeps working.
output_path = "surfing.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)

# VideoWriter does not raise when it cannot be opened (e.g. codec unavailable or
# path not writable); without this check the cell would report success for a
# file that was never written.
if not out.isOpened():
    out.release()
    raise RuntimeError(f"Could not open a video writer for '{output_path}'")

try:
    # Write each frame
    for frame in tqdm(frames_rgb, desc="Writing video with OpenCV..."):
        frame_bgr = cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)  # Convert RGB (PIL) to BGR (OpenCV)
        out.write(frame_bgr)
finally:
    # Always finalize the container, even if a frame fails to convert or write.
    out.release()

print(f"Video saved at: {os.path.abspath(output_path)}")

 Cleaned prompt: 'Show videos of two cats playing in the park'


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. 


  0%|          | 0/25 [00:00<?, ?it/s]

Converting frames...: 100%|██████████| 1/1 [00:00<00:00,  9.12it/s]
Resizing frames...: 100%|██████████| 96/96 [00:00<00:00, 10447.94it/s]
Writing video with OpenCV...: 100%|██████████| 96/96 [00:00<00:00, 1142.89it/s]

Video saved at: /home/alif_ahmad/work/Script/surfing.mp4



