In [58]:
import os
from tqdm import tqdm

In [59]:
path = '/mnt/video_data/pexels/metadata/results_400k_train_rfcap_pexels_reformat_clean.csv'

In [60]:
# Read in from CSV file
with open(path, 'r') as file:
    data = file.readlines()

In [61]:
import csv

data_dict = {}

# Assuming "video_data.csv" is your file:
with open(path, 'r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    next(reader)  # Skip the header if there is one
    for row in reader:
        # Adjust the indices depending on your CSV's actual columns:
        vid_id    = row[0]
        text      = row[1]
        fps       = row[2]
        num_frame = row[3]
        h         = row[4]
        w         = row[5]

        data_dict[vid_id] = {
            'text': text, 
            'fps': fps, 
            'h': h, 
            'w': w, 
            'num_frame': num_frame
        }

# Now data_dict should contain the correct mappings,
# with commas inside the text field preserved.


In [62]:
data_dict

{'pexels/videos-popular/8620887': {'text': ' father carrying his son on a beach',
  'fps': '25.0',
  'h': '2160',
  'w': '3840',
  'num_frame': '384'},
 'pexels/videos-popular/5935446': {'text': ' group of friends having a toast',
  'fps': '25.0',
  'h': '2160',
  'w': '3840',
  'num_frame': '327'},
 'pexels/videos-popular/7187451': {'text': ' tattooed woman climbing rocks',
  'fps': '24.0',
  'h': '1080',
  'w': '1920',
  'num_frame': '324'},
 'pexels/videos-popular/4305111': {'text': ' ray of the sun on the forest ground',
  'fps': '29.97002997002997',
  'h': '1080',
  'w': '1920',
  'num_frame': '1228'},
 'pexels/videos-popular/4623273': {'text': 'Students walk along a campus sidewalk, surrounded by trees and modern buildings, while two individuals pause to chat beside a bicycle.',
  'fps': '24.0',
  'h': '2160',
  'w': '3840',
  'num_frame': '283'},
 'pexels/videos-popular/5940459': {'text': ' woman taling in front of the class in a library',
  'fps': '25.0',
  'h': '2160',
  'w': 

In [63]:
# Filter data_dict to only include videos in horizontal manner by calculating aspect ratio, and short videos with less than 50 frames, and print out message which video is filtered for which reason
filtered_data_dict = {}
asp_cnter = 0
frame_cnter = 0
for vid_id, info in tqdm(data_dict.items()):
    h = float(info['h'])
    w = float(info['w'])
    num_frame = int(info['num_frame'])
    aspect_ratio = w / h
    if aspect_ratio >= 1.5 and num_frame > 50:
        filtered_data_dict[vid_id] = info
    else:
        if aspect_ratio < 1.5:
            # print(f'Filtered video {vid_id} because of aspect ratio {aspect_ratio}')
            asp_cnter += 1
        if num_frame < 50:
            # print(f'Filtered video {vid_id} because of num_frame {num_frame}')
            frame_cnter += 1
print(f'Filtered {asp_cnter} videos because of aspect ratio, and {frame_cnter} videos because of num_frame')


 31%|███       | 121563/394468 [00:00<00:00, 1215515.24it/s]

100%|██████████| 394468/394468 [00:00<00:00, 1289382.38it/s]

Filtered 110286 videos because of aspect ratio, and 10 videos because of num_frame





In [64]:
# Write filtered data_dict to a new CSV file
new_path = '/mnt/carpedkm_data/image_gen_ds/second_stage_video_train'
os.makedirs(new_path, exist_ok=True)

In [65]:
# save as json file
import json
with open(os.path.join(new_path, 'second_stage_video_filtered_data_dict.json'), 'w') as f:
    json.dump(filtered_data_dict, f)

## VAE extraction code testing

In [66]:
# %%
# Import necessary packages
import os
import json
import cv2
import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
from diffusers import AutoencoderKLCogVideoX
import decord
from decord import VideoReader
from multiprocessing import Process, Queue, Value

In [67]:
def process_video(queue, progress_queue, vae_model_path, max_frames, width, height, gpu_id, output_dir, fps):
    """
    Process videos assigned to a specific GPU.
    """
    device = f"cuda:{gpu_id}"

    # Load the VAE model
    vae = AutoencoderKLCogVideoX.from_pretrained(vae_model_path, subfolder="vae")
    vae.to(device)
    vae.eval()

    while True:
        path_and_fps = queue.get()
        if video_path is None:  # End signal
            break
        video_path, original_fps = path_and_fps

        try:
            # Load video using Decord
            decord.bridge.set_bridge("native")
            vr = VideoReader(video_path, ctx=decord.cpu(0))

            # Calculate frame interval
            original_fps = float(original_fps)
            frame_interval = int(original_fps / fps)

            # Extract frames
            # frames = vr.get_batch(range(0, min(len(vr), max_frames * frame_interval), frame_interval)).asnumpy()
            frames = vr.get_batch(range(0, min(len(vr), max_frames * frame_interval), frame_interval)).asnumpy()

            # Ensure exact number of frames
            if frames.shape[0] < max_frames:
                pad_frames = max_frames - frames.shape[0]
                print('>> shorter than max_frames : doing padding')
                frames = np.pad(frames, ((0, pad_frames), (0, 0), (0, 0), (0, 0)), mode="constant")
            elif frames.shape[0] > max_frames:
                frames = frames[:max_frames]
            # Resize frames using OpenCV
            frames = np.array([cv2.resize(frame, (width, height), interpolation=cv2.INTER_LINEAR) for frame in frames])

            # Convert to torch tensor and preprocess
            frames = torch.from_numpy(frames).float() / 255.0 * 2.0 - 1.0  # Normalize [-1, 1]
            frames = frames.permute(0, 3, 1, 2)  # [F, H, W, C] -> [F, C, H, W]
            
            # Add batch dimension and permute for VAE
            frames = frames.unsqueeze(0).permute(0, 2, 1, 3, 4).to(device)  # [B, C, F, H, W]

            # Encode video to latent space
            with torch.no_grad():
                latent_dist = vae.encode(frames).latent_dist
                latents = latent_dist.sample() * vae.config.scaling_factor

            # Save latents
            output_path = os.path.join(output_dir, Path(video_path).stem + "_vae_latents.npy")
            np.save(output_path, latents.cpu().numpy())

        except Exception as e:
            print(f"Error processing video {video_path}: {e}")

        # Clear GPU memory
        torch.cuda.empty_cache()

        # Notify progress
        progress_queue.put(1)


def extract_vae_latents(
    video_dir, video_keys, vae_model_path, output_dir, height=480, width=720, max_frames=49, fps=8, video_dict=None,
):
    """
    Extract VAE latents using multiple GPUs with controlled processes.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Detect available GPUs
    available_gpus = list(range(torch.cuda.device_count()))
    if not available_gpus:
        raise RuntimeError("No GPUs are available!")

    print(f"Using GPUs: {available_gpus}")

    # Create a process for each GPU
    queues = [Queue() for _ in available_gpus]
    progress_queue = Queue()
    processes = []

    for gpu_id, queue in zip(available_gpus, queues):
        process = Process(target=process_video, args=(queue, progress_queue, vae_model_path, max_frames, width, height, gpu_id, output_dir, fps))
        process.start()
        processes.append(process)

    # Get video paths and original FPS pairs
    video_paths = [os.path.join(video_dir, vid_id + '.mp4') for vid_id in sorted(video_keys)]
    video_fps = [video_dict[vid_id]['fps'] for vid_id in sorted(video_keys)]
    # Distribute videos to queues
    for i, path_and_fps in enumerate(zip(video_paths, video_fps)):
        
        queues[i % len(available_gpus)].put(path_and_fps)

    # Send termination signals
    for queue in queues:
        queue.put(None)

    # Track progress using tqdm
    with tqdm(total=len(video_paths), desc="Extracting VAE latents") as pbar:
        completed = 0
        while completed < len(video_paths):
            progress_queue.get()  # Wait for progress notification
            completed += 1
            pbar.update(1)

    # Wait for all processes to finish
    for process in processes:
        process.join()


In [None]:
if __name__ == "__main__":
    import multiprocessing
    multiprocessing.set_start_method('spawn', True)

    json_dir = os.path.join(new_path, 'second_stage_video_filtered_data_dict.json')

    with open(json_dir, "r") as f:
        video_dict = json.load(f)

    video_keys = list(video_dict.keys())

    # Random sample 4K videos
    import random
    random.seed(42)
    video_keys = random.sample(video_keys, 4000)
    new_path = '/mnt/carpedkm_data/image_gen_ds/second_stage_video_train'
    # save the sampled video json file
    with open(os.path.join(new_path, 'second_stage_video_filtered_data_dict_sampled_4k.json'), 'w') as f:
        json.dump({k: video_dict[k] for k in video_keys}, f)
        
    video_dir = '/mnt/video_data/'
    vae_model_path = "THUDM/CogVideoX-5b"
    output_dir = "/mnt/carpedkm_data/image_gen_ds/second_stage_video_train_pexels"
    video_paths = [str(Path(video_dir) / f"{video_key}.mp4") for video_key in video_keys]
    extract_vae_latents(
            video_dir,
            video_keys,
            vae_model_path,
            output_dir,
            height=480,
            width=720,
            max_frames=25,
            fps=16,
            video_dict=video_dict,
        )

Using GPUs: [0, 1, 2, 3]


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/conda/envs/ptca/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/conda/envs/ptca/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'process_video' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/conda/envs/ptca/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/conda/envs/ptca/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'process_video' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/conda/envs/ptca/lib/python3.10/multiprocessing/

KeyboardInterrupt: 