In [1]:
import copy
import os
from dataclasses import dataclass, field
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from diffusers import AutoencoderKL, DDPMScheduler
from diffusers.optimization import get_scheduler
from einops import rearrange
from tqdm.auto import tqdm
from transformers import AutoTokenizer, CLIPTextModel
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from accelerate import Accelerator

# 假设我们有DiT模型定义，类似于diffusers中的UNet
# from my_models import DiTModel, DiTConfig # 这是一个假设的模型定义
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel, AutoencoderKLHunyuanVideo

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
    PyTorch 2.5.1+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [11]:
from diffusers import FlowMatchEulerDiscreteScheduler
import json

# Parse the scheduler hyper-parameters from the local JSON config file.
scheduler_config_path = "scheduler_config.json"
with open(scheduler_config_path, "r") as f:
    scheduler_config = json.loads(f.read())

# Instantiate the flow-matching scheduler from the parsed config dict.
scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

In [24]:
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel, AutoencoderKLHunyuanVideo
# Resolve the JSON file into a config dict first: `from_config` is documented to take a
# dict, and handing it a path only works through a deprecated fallback in diffusers.
vae = AutoencoderKLHunyuanVideo.from_config(
    AutoencoderKLHunyuanVideo.load_config("HunyuanConfig/vae.json")
)

In [None]:
def _fm_from_pred_velocity_to_pred_video(model_output, noisy_latents, timestep):
    """
    from velocity to video
    """
    sigmas = scheduler.sigmas
    schedule_timesteps = scheduler.timesteps
    step_indices = [scheduler.index_for_timestep(t, schedule_timesteps) for t in timestep]
    sigma = sigmas[step_indices].flatten()
    
    # 调整 sigma 的维度用于广播
    while len(sigma.shape) < len(noisy_latents.shape):
        sigma = sigma.unsqueeze(-1)
        
    # x_clean_pred = x_t + t * v_pred
    pred_video = noisy_latents + sigma * model_output
    
    return pred_video

In [None]:
# Inspect the sigma values currently held by the scheduler.
print(scheduler.sigmas)

In [18]:
# Random stand-ins for the model prediction and the noisy latents. Both use the same
# 5-D shape, and `model_output` is drawn first, then `noisy_latents`.
model_output, noisy_latents = (torch.randn((1, 3, 16, 224, 224)) for _ in range(2))

def _sample_timestep() -> torch.Tensor:
    """
    随机选择一个时间步，用于生成噪声，返回形状为 [batch_size] 的 tensor。
    """
    idx = torch.randint(0, len(scheduler.timesteps), (1,))
    sampled_timesteps = scheduler.timesteps[idx]
    return sampled_timesteps
# Draw a timestep from the scheduler's grid and show it.
timestep = _sample_timestep()
print(timestep)
# Smoke-test the velocity-to-sample conversion on the dummy tensors. The helper iterates
# over `timestep`, so the 1-D tensor is handed over as a list of 0-d tensors.
x = _fm_from_pred_velocity_to_pred_video(model_output, noisy_latents, list(timestep))

tensor([846.6813])


In [None]:
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel, AutoencoderKLHunyuanVideo
from transformers import LlamaModel, CLIPTextModel
import torch
# Load only the first tokenizer / text encoder of the pipeline; the transformer, the VAE
# and the second text encoder / tokenizer are explicitly left out.
model_pipe = HunyuanVideoPipeline.from_pretrained(
            "hunyuanvideo-community/HunyuanVideo", 
            transformer=None,
            vae=None,
            text_encoder_2=None,
            tokenizer_2=None,
            torch_dtype=torch.bfloat16
        ).to("cuda")

text = "<PAD>"
# Hidden states are read this many layers before the last one.
num_hidden_layers_to_skip = 2
text_inputs = model_pipe.tokenizer(
    text, padding="max_length", max_length=10, return_tensors="pt"
)
text_input_ids = text_inputs.input_ids.to("cuda")
prompt_attention_mask = text_inputs.attention_mask.to("cuda")

# The encoder is only used for inference here, so skip autograd bookkeeping: otherwise
# `prompt_embeds` keeps the whole forward graph (and its activations) alive.
with torch.no_grad():
    prompt_embeds = model_pipe.text_encoder(
        input_ids=text_input_ids,
        attention_mask=prompt_attention_mask,
        output_hidden_states=True,
    ).hidden_states[-(num_hidden_layers_to_skip + 1)]


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import pyarrow.parquet as pq

# Read the whole parquet file into a table
table = pq.read_table("/work/hdd/bcjw/jcai2/dataset/mixkit-processed/combined_parquet_dataset/worker_0/data_chunk_0.parquet")

# Convert it to a plain Python dict (column name -> list of values)
data = table.to_pydict()  
print(data.keys())  # list the available columns
print(data["id"][0])  # id of the first record


dict_keys(['id', 'vae_latent_bytes', 'vae_latent_shape', 'vae_latent_dtype', 'text_embedding_bytes', 'text_embedding_shape', 'text_embedding_dtype', 'pooled_text_embedding_bytes', 'pooled_text_embedding_shape', 'pooled_text_embedding_dtype', 'text_attention_mask_bytes', 'text_attention_mask_shape', 'text_attention_mask_dtype', 'file_name', 'caption', 'media_type', 'width', 'height', 'num_frames', 'duration_sec', 'fps'])
mixkit-industrial-machine-working-2230_clip_1


In [2]:
import pyarrow.parquet as pq
import numpy as np
import torch

def load_parquet_record(path, idx=0):
    """Load one record from a parquet file and rebuild its serialized tensors.

    Each tensor is stored as three columns, ``<prefix>_bytes``, ``<prefix>_shape``
    and ``<prefix>_dtype``, which are recombined into a ``torch.Tensor``.

    Args:
        path: Path of the parquet file to read.
        idx: Row index of the record. Negative values count from the end.

    Returns:
        A dict with the scalar metadata fields (``id``, ``caption``, ...) and the
        restored tensors ``vae_latent``, ``text_embedding``,
        ``pooled_text_embedding`` and ``text_attention_mask``.

    Raises:
        IndexError: If ``idx`` is outside the table's row range.
        KeyError: If an expected column is missing from the file.
    """
    table = pq.read_table(path)

    num_rows = table.num_rows
    if not -num_rows <= idx < num_rows:
        raise IndexError(f"record index {idx} out of range for {num_rows} rows in {path}")
    if idx < 0:
        idx += num_rows

    # Convert only the requested row to Python objects; turning the whole table into
    # lists would materialize every row's tensor bytes just to read one record.
    row = {name: values[0] for name, values in table.slice(idx, 1).to_pydict().items()}

    def restore_tensor(prefix):
        """Rebuild the tensor stored under ``prefix`` from its bytes/shape/dtype columns."""
        arr_bytes = row[f"{prefix}_bytes"]
        arr_shape = row[f"{prefix}_shape"]
        arr_dtype = row[f"{prefix}_dtype"]
        # np.frombuffer yields a read-only view over the immutable bytes object; copy it
        # so the tensor owns writable memory (torch warns on non-writable arrays and
        # writing through such a tensor is undefined behaviour).
        np_array = np.frombuffer(arr_bytes, dtype=arr_dtype).reshape(arr_shape).copy()
        return torch.from_numpy(np_array)

    scalar_fields = (
        "id",
        "caption",
        "file_name",
        "media_type",
        "width",
        "height",
        "num_frames",
        "duration_sec",
        "fps",
    )
    tensor_fields = (
        "vae_latent",
        "text_embedding",
        "pooled_text_embedding",
        "text_attention_mask",
    )

    record = {name: row[name] for name in scalar_fields}
    record.update({name: restore_tensor(name) for name in tensor_fields})
    return record

# Example: load the first record and print its id, latent shape/dtype and caption
record = load_parquet_record("/work/hdd/bcjw/jcai2/dataset/mixkit-processed/combined_parquet_dataset/worker_0/data_chunk_0.parquet", idx=0)
print(record["id"])
print(record["vae_latent"].shape, record["vae_latent"].dtype)
print(record["caption"])


mixkit-industrial-machine-working-2230_clip_1
torch.Size([16, 21, 60, 104]) torch.float32
The video captures an intricate close-up of a high-precision CNC machining tool operating with mechanical finesse. The tool head, prominently centered, rotates smoothly and rhythmically as it carefully hones a metal workpiece, demonstrating precise industrial craftsmanship. The metallic surface of the tool gleams under soft, diffuse lighting, highlighting its polished finish and utilitarian design. Shadows play across the surrounding machinery, adding depth and contrast to the industrial scene. Subtle reflections are visible on the pristine metal surface, emphasizing its high-quality engineering and maintenance. The cool, muted tones of the machining environment suggest a place of focused technical activity. This scene embodies a symphony of engineering precision and mechanical elegance, ideal for recreations geared towards industrial realism.


  return torch.from_numpy(np_array)  # 返回 torch.Tensor
