# CogVideoX Feature Extraction

## 1. Setup and Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
import os
import json
from datetime import datetime
from torchvision import transforms
import numpy as np
from diffusers import CogVideoXPipeline
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Configure the MPS allocator through its environment variable before any MPS
# tensor is created.
# NOTE(review): '0.0' is presumably the "no upper limit" setting of the
# high-watermark ratio — confirm against the PyTorch MPS documentation.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Use Apple's MPS backend when it is available, otherwise fall back to the CPU.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Using device: {device}")

def show_frame(tensor_frame):
    """Render one channel-first frame with matplotlib (debugging aid).

    Args:
        tensor_frame: Tensor with channels first; a 4-D input has its leading
            axis squeezed away. Values are mapped back with ``x * 0.5 + 0.5``
            and clipped to [0, 1] before display.
    """
    frame = tensor_frame.squeeze(0) if tensor_frame.dim() == 4 else tensor_frame

    # matplotlib needs a float32 NumPy array on the host.
    pixels = frame.to(torch.float32).cpu().numpy()
    pixels = (pixels * 0.5 + 0.5).clip(0, 1)

    # Channel-first -> channel-last, the layout imshow expects.
    channel_last = pixels.transpose(1, 2, 0)

    plt.figure(figsize=(10, 6))
    plt.imshow(channel_last)
    plt.axis('off')
    plt.show()

def print_tensor_info(tensor, name="Tensor"):
    """Print shape, dtype, device and min/max of ``tensor``.

    Args:
        tensor: Object to describe. Anything that is not a ``torch.Tensor``
            only gets a one-line notice naming its type.
        name: Label used in the printed header.
    """
    if isinstance(tensor, torch.Tensor):
        print(f"{name}:")
        print(f"  Shape: {tensor.shape}")
        print(f"  Type: {tensor.dtype}")
        print(f"  Device: {tensor.device}")
        lowest = tensor.min().item()
        highest = tensor.max().item()
        print(f"  Value range: [{lowest:.2f}, {highest:.2f}]")
    else:
        print(f"{name} is not a tensor, but a {type(tensor)}")

def print_module_info(module, name="Module"):
    """Print a short summary of a ``torch.nn.Module``.

    The summary contains the module type, the device and dtype of its first
    parameter, the total parameter count, and every public attribute whose
    value is a plain ``int``, ``float``, ``str``, ``bool`` or ``torch.dtype``.

    Args:
        module: Module to describe. Modules without parameters are supported;
            their device and dtype are reported as ``n/a``.
        name: Label used in the printed header.
    """
    print(f"\n{name} Information:")
    print(f"Type: {type(module)}")

    # Look the first parameter up once; parameter-free modules yield None
    # instead of raising StopIteration.
    first_param = next(module.parameters(), None)
    device_text = first_param.device if first_param is not None else "n/a"
    dtype_text = first_param.dtype if first_param is not None else "n/a"
    print(f"Device: {device_text}")
    print(f"Parameter dtype: {dtype_text}")

    total_params = sum(p.numel() for p in module.parameters())
    print(f"Total parameters: {total_params:,}")

    print("\nKey attributes:")
    for attr_name in dir(module):
        if attr_name.startswith('_'):
            continue
        try:
            attr = getattr(module, attr_name)
        except Exception:
            # Best effort: some properties raise when read out of context.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            continue
        if isinstance(attr, (int, float, str, bool, torch.dtype)):
            print(f"  {attr_name}: {attr}")

def clear_memory():
    """Run garbage collection and empty the CUDA and MPS caches when present.

    The collector runs once before and once after the cache flushes.
    """
    import gc

    # (availability check, cache flush) per backend; the lambda defers the
    # torch.mps lookup until MPS is known to be available.
    backends = (
        (torch.cuda.is_available, lambda: torch.cuda.empty_cache()),
        (torch.backends.mps.is_available, lambda: torch.mps.empty_cache()),
    )

    gc.collect()
    for is_available, flush in backends:
        if is_available():
            flush()
    gc.collect()

def save_features(features_dict, save_dir):
    """Write extracted VAE features, latents and a metadata file to ``save_dir``.

    Files are written directly into ``save_dir`` (created if missing):

        save_dir/
          ├── <level>_features.pt   one file per key of ``vae_features``
          ├── latents.pt
          └── metadata.json

    Each ``.pt`` file holds a dict with the keys ``tensor``, ``dtype``,
    ``device`` and ``shape``. ``metadata.json`` records the extraction time
    plus the shape and dtype of every saved tensor.

    Args:
        features_dict: Dict with ``'vae_features'`` (mapping of level name to
            tensor) and ``'latents'`` (tensor).
        save_dir: Destination directory.

    Raises:
        KeyError: If ``features_dict`` lacks ``'vae_features'`` or ``'latents'``.
    """

    def tensor_record(tensor):
        # On-disk record shared by feature and latent files.
        return {
            'tensor': tensor,
            'dtype': str(tensor.dtype),
            'device': str(tensor.device),
            'shape': tensor.shape,
        }

    # Read both inputs before touching the disk so that a malformed dict
    # fails without leaving a half-written directory behind.
    vae_features = features_dict['vae_features']
    latents = features_dict['latents']

    os.makedirs(save_dir, exist_ok=True)

    for level, tensor in vae_features.items():
        torch.save(tensor_record(tensor), os.path.join(save_dir, f"{level}_features.pt"))

    torch.save(tensor_record(latents), os.path.join(save_dir, "latents.pt"))

    # Metadata is derived from the levels actually present rather than from a
    # fixed list, so any subset or superset of encoder levels can be saved.
    feature_shapes = {level: list(t.shape) for level, t in vae_features.items()}
    feature_shapes['latents'] = list(latents.shape)
    dtypes = {level: str(t.dtype) for level, t in vae_features.items()}
    dtypes['latents'] = str(latents.dtype)

    metadata = {
        'extraction_date': datetime.now().isoformat(),
        'feature_shapes': feature_shapes,
        'dtypes': dtypes,
    }

    metadata_path = os.path.join(save_dir, "metadata.json")
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"\nFeatures saved successfully in {save_dir}")
    print("\nSaved files:")
    for root, _, files in os.walk(save_dir):
        for file in files:
            file_path = os.path.join(root, file)
            size_mb = os.path.getsize(file_path) / (1024 * 1024)
            print(f"  {os.path.relpath(file_path, save_dir):<30} {size_mb:.2f} MB")


def load_features(save_dir, map_location=None):
    """Load features previously written to ``save_dir``.

    Reads ``metadata.json``, one ``<level>_features.pt`` per feature level and
    ``latents.pt``. The levels are taken from the ``feature_shapes`` entry of
    the metadata (every key except ``'latents'``); when that entry is missing
    the six default encoder levels are used.

    Args:
        save_dir: Directory holding the saved files.
        map_location: Forwarded to ``torch.load``. Pass ``'cpu'`` to load
            tensors that were saved from a device this machine does not have.
            ``None`` keeps ``torch.load``'s default behaviour.

    Returns:
        Dict with ``'vae_features'`` (level name -> tensor), ``'latents'``
        (tensor) and ``'metadata'`` (parsed JSON).
    """
    default_levels = ['conv_in', 'down_block0', 'down_block1', 'down_block2', 'down_block3', 'mid_block']

    with open(os.path.join(save_dir, "metadata.json"), 'r') as f:
        metadata = json.load(f)

    # Prefer the levels recorded at save time so partial feature sets load too.
    recorded = [level for level in metadata.get('feature_shapes', {}) if level != 'latents']
    levels = recorded or default_levels

    vae_features = {}
    for level in levels:
        path = os.path.join(save_dir, f"{level}_features.pt")
        vae_features[level] = torch.load(path, map_location=map_location)['tensor']

    latents_data = torch.load(os.path.join(save_dir, "latents.pt"), map_location=map_location)

    return {
        'vae_features': vae_features,
        'latents': latents_data['tensor'],
        'metadata': metadata,
    }

class VideoFrameLoader:
    """Load ``.jpg`` frames from a directory as a normalized tensor stack.

    Every frame is resized to 480x720, converted to a tensor and normalized
    with mean 0.5 / std 0.5 per channel.
    """

    def __init__(self, frame_dir, dtype=torch.float16):
        """Collect the frame file names and build the preprocessing transform.

        Args:
            frame_dir: Directory containing the ``.jpg`` frames.
            dtype: dtype the stacked frames are cast to.
        """
        self.frame_dir = frame_dir
        # NOTE: plain lexicographic order, so frame names need zero-padded
        # indices to come out in temporal order.
        self.frame_files = sorted([f for f in os.listdir(frame_dir) if f.endswith('.jpg')])
        self.dtype = dtype

        self.transform = transforms.Compose([
            transforms.Resize((480, 720), antialias=True),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])

    def load_frames(self, num_frames=None, verbose=True):
        """Load the first ``num_frames`` frames and stack them along dim 0.

        Args:
            num_frames: Maximum number of frames to load; ``None`` loads all.
                Requests larger than the number of available files are
                clamped.
            verbose: When true, print progress and display the first frame.

        Returns:
            The stacked frames cast to ``self.dtype``.

        Raises:
            RuntimeError: If there is no frame to load.
        """
        available = len(self.frame_files)
        if num_frames is None:
            num_frames = available
        # Clamp up front so the log line reports what is really loaded.
        num_frames = min(num_frames, available)

        if num_frames <= 0:
            raise RuntimeError(
                f"No frames to load from {self.frame_dir} "
                f"({available} '.jpg' files found)"
            )

        if verbose:
            print(f"Loading {num_frames} frames from {self.frame_dir}")

        frames = []
        for file_name in tqdm(self.frame_files[:num_frames], disable=not verbose):
            img_path = os.path.join(self.frame_dir, file_name)
            # The context manager closes the file handle deterministically.
            with Image.open(img_path) as img:
                frames.append(self.transform(img.convert('RGB')))

        frames_tensor = torch.stack(frames).to(self.dtype)

        if verbose:
            print(f"Loaded frames tensor with shape: {frames_tensor.shape}")
            show_frame(frames_tensor[0])
        return frames_tensor

## 2. VAE

In [None]:
class VAEFeatureExtractor:
    """Capture VAE encoder activations with forward hooks.

    Hooks are registered on ``encoder.conv_in``, ``encoder.down_blocks.0`` to
    ``encoder.down_blocks.3`` and ``encoder.mid_block`` of ``pipeline.vae``.
    ``extract_features`` runs ``vae.encode`` and returns the hooked outputs
    together with latents sampled from the encoder's output distribution.
    """

    def __init__(self, pipeline, device='mps'):
        """Move the pipeline's VAE to ``device`` and register the hooks.

        Args:
            pipeline: Object exposing the VAE as ``pipeline.vae``.
            device: Device the VAE is moved to and inputs are sent to.
        """
        self.vae = pipeline.vae.to(device)
        self.device = device
        self.features = {}
        self.hook_handles = []

        # Level name -> dotted module paths below the VAE.
        self.target_layers = {
            'conv_in': ['encoder.conv_in'],
            'down_block0': ['encoder.down_blocks.0'],
            'down_block1': ['encoder.down_blocks.1'],
            'down_block2': ['encoder.down_blocks.2'],
            'down_block3': ['encoder.down_blocks.3'],
            'mid_block': ['encoder.mid_block']
        }
        self._register_hooks()

    def _hook_fn(self, layer_name):
        """Build a forward hook that stores the layer output under ``layer_name``."""
        def hook(module, inp, out):
            # Some modules return a tuple; the first element is kept.
            feat = out[0] if isinstance(out, tuple) else out
            if isinstance(feat, torch.Tensor):
                self.features[layer_name] = feat.detach()
            else:
                print(f"Warning: {layer_name} output is not a tensor but {type(feat)}")
        return hook

    def _register_hooks(self):
        """Resolve every dotted path in ``target_layers`` and attach a hook."""
        for layers in self.target_layers.values():
            for layer_name in layers:
                module = self.vae
                for part in layer_name.split('.'):
                    module = getattr(module, part)
                handle = module.register_forward_hook(self._hook_fn(layer_name))
                self.hook_handles.append(handle)

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for handle in self.hook_handles:
            handle.remove()
        self.hook_handles.clear()

    @torch.no_grad()
    def extract_features(self, frames, verbose=True):
        """Encode ``frames`` and return the hooked features plus sampled latents.

        Args:
            frames: Frame stack of shape ``[T, 3, 480, 720]``.
            verbose: When true, print tensor diagnostics along the way.

        Returns:
            Tuple ``(features_dict, latents)``: ``features_dict`` maps each
            level name to the activation captured during this call, and
            ``latents`` is a sample from the encoder's output distribution.
        """
        # Drop captures from an earlier call so stale tensors can never be
        # reported as belonging to this input.
        self.features.clear()

        frames = frames.to(self.device)
        if verbose:
            print_tensor_info(frames, "Input frames")

        # [T, C, H, W] -> [B=1, C, T, H, W], the layout passed to vae.encode.
        frames_5d = frames.unsqueeze(0).permute(0, 2, 1, 3, 4).contiguous()
        if verbose:
            print_tensor_info(frames_5d, "Reshaped frames for VAE encoder")

        # Forward pass; the hooks fill self.features as a side effect.
        encoder_output = self.vae.encode(frames_5d)
        if verbose:
            print(f"Encoder output type: {type(encoder_output)}")
        latent_dist = encoder_output[0]
        if verbose:
            print(f"Latent distribution type: {type(latent_dist)}")
        latents = latent_dist.sample()
        if verbose:
            print_tensor_info(latents, "Sampled latents")

        # Re-key the captures from module path to level name.
        features_dict = {}
        for level, layer_names in self.target_layers.items():
            for ln in layer_names:
                if ln in self.features:
                    features_dict[level] = self.features[ln]
                    if verbose:
                        print(f"\n{level}-level features from {ln}:")
                        print_tensor_info(features_dict[level], f"{level} features")

        return features_dict, latents
    
def extract_and_save_vae_features(frame_dir, save_dir, num_frames=None, device='mps'):
    """Phase 1: encode the frames in ``frame_dir`` and save the VAE features.

    Loads the frames, loads the ``THUDM/CogVideoX-2b`` pipeline in float16,
    captures the VAE encoder activations and sampled latents, and writes them
    to ``save_dir`` through ``save_features``.

    Args:
        frame_dir: Directory with the input frames.
        save_dir: Directory the features are written to.
        num_frames: Number of frames to load; ``None`` loads all.
        device: Device the pipeline runs on.
    """
    print("\n=== PHASE 1: Extracting VAE Encoder Features ===")

    # Load the frames first: it is cheap and fails fast on a bad directory,
    # before the large model is pulled into memory.
    loader = VideoFrameLoader(frame_dir, dtype=torch.float16)
    frames = loader.load_frames(num_frames=num_frames)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)

    # Hook + extract; the hooks come off even when extraction raises.
    vae_extractor = VAEFeatureExtractor(pipeline, device)
    try:
        vae_features, latents = vae_extractor.extract_features(frames, verbose=True)
    finally:
        for h in vae_extractor.hook_handles:
            h.remove()

    features_dict = {
        'vae_features': vae_features,
        'latents': latents
    }
    save_features(features_dict, save_dir)

    print(f"\n[PHASE 1 DONE] VAE features + latents saved to: {save_dir}")

    # Drop every reference to the model and tensors so the cache flush below
    # can actually release their memory.
    del features_dict, vae_features, latents, vae_extractor, pipeline, frames
    clear_memory()

# Input frames and output location for this run.
frame_dir = "data/29"
save_dir = "features/29"

# Phase 1: Extract VAE encoder features from the real frames
extract_and_save_vae_features(
    frame_dir=frame_dir,
    save_dir=save_dir,
    num_frames=32,
    device=device
)

# Second cleanup at module level, after the function's locals have gone out
# of scope.
clear_memory()

print("\nFeature extraction phase 1 complete: VAE encoder features + latents")

## 3. Add Noise to Latents

In [16]:
def add_noise_to_latents(latents_path, save_dir, timestep=0, device='cpu'):
    """Add a small, fixed amount of Gaussian noise to saved VAE latents.

    The noisy latents follow the forward diffusion equation

        x_t = sqrt(alpha_bar) * x_0 + sqrt(1 - alpha_bar) * eps,  eps ~ N(0, I)

    with a fixed ``alpha_bar = 0.999``, i.e. almost all of the original signal
    is kept. The result is written to ``save_dir/noisy_latents.pt``.

    Args:
        latents_path: Path to a file saved as a dict with a ``'tensor'`` entry.
        save_dir: Directory for ``noisy_latents.pt`` (created if missing).
        timestep: Label stored with the result. It does NOT select the noise
            level: no scheduler is consulted and ``alpha_bar`` is fixed.
        device: Device the latents are loaded onto and the noise is created on.

    Returns:
        The noisy latents tensor.
    """
    # map_location lets latents saved from another device (e.g. MPS) load here.
    latents_data = torch.load(latents_path, map_location=device)
    latents = latents_data['tensor'].to(device)

    print(f"Original latents shape: {latents.shape}")
    print(f"Original latents range: [{latents.min():.3f}, {latents.max():.3f}]")

    # Fixed cumulative signal level, standing in for a one-step noise schedule.
    alpha = torch.tensor(0.999).to(device)

    noise = torch.randn_like(latents)

    # Square-root weights are what make this the forward diffusion equation;
    # weighting by alpha and (1 - alpha) directly would be a plain linear
    # blend with ~30x less noise than alpha_bar = 0.999 implies.
    signal_scale = torch.sqrt(alpha)
    noise_scale = torch.sqrt(1 - alpha)
    noisy_latents = signal_scale * latents + noise_scale * noise

    print(f"Noisy latents range: [{noisy_latents.min():.3f}, {noisy_latents.max():.3f}]")

    os.makedirs(save_dir, exist_ok=True)
    noisy_save_path = os.path.join(save_dir, 'noisy_latents.pt')
    torch.save({
        'tensor': noisy_latents,
        'dtype': str(noisy_latents.dtype),
        'device': str(noisy_latents.device),
        'shape': noisy_latents.shape,
        'timestep': timestep,
        'alpha': alpha.item(),
    }, noisy_save_path)

    print(f"\nNoisy latents saved to: {noisy_save_path}")
    print(f"Added noise at timestep {timestep}")
    print(f"Alpha bar (noise level): {alpha.item():.6f}")

    return noisy_latents

In [None]:
# Source latents and output directory for the noise step. This rebinds the
# module-level `save_dir`.
latents_path = "features/horsejump-high/cogvideox_vae/latents.pt"
save_dir = "features/horsejump-high/cogvideox_vae"
# Writes noisy_latents.pt into save_dir and keeps the result in memory.
noisy_latents = add_noise_to_latents(latents_path, save_dir, timestep=0)

## 4. Transformer

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate activations of a pipeline's transformer via hooks.

    Forward hooks are attached to the blocks of
    ``pipeline.transformer.transformer_blocks`` whose index is listed in
    ``block_indices`` and to the transformer module itself. Every forward pass
    overwrites the previous capture, so after several passes ``features``
    holds the values of the last pass only.

    Keys written to ``features``:
        ``block_{i}_hidden`` / ``block_{i}_enc``: the two outputs of block ``i``.
        ``final_unpatchified``: the transformer's own output tensor.
    """

    def __init__(self, pipeline, block_indices):
        """Register the hooks.

        Args:
            pipeline: Object exposing the model as ``pipeline.transformer``.
            block_indices: Indices of the transformer blocks to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every requested transformer block."""
        pending = set(self.block_indices)
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in pending:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)
                pending.discard(idx)
        if pending:
            # A requested index with no matching block would otherwise be
            # silently missing from the saved features.
            print(f"Warning: no transformer block at indices {sorted(pending)}")

    def _register_final_output_hook(self):
        """Attach a hook to the transformer itself to capture its output."""
        def final_hook(module, inp, out):
            # The output is either an object carrying `.sample`, a
            # tuple/list whose first element is the sample, or the tensor.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Build the hook that stores both outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for h in self.hook_handles:
            h.remove()
        self.hook_handles.clear()

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path=None,
):
    """Phase 2: run the pipeline on saved latents and save transformer features.

    Loads latents from disk, runs the ``THUDM/CogVideoX-2b`` pipeline with
    hooks on the selected transformer blocks, and writes the captured
    activations to ``save_dir/cogvideox_features.pt``.

    Args:
        save_dir: Directory for the output file (created if missing).
        prompt: Text prompt passed to the pipeline.
        block_indices: Transformer block indices to capture.
        num_inference_steps: Number of pipeline steps. The hooks keep only the
            most recent forward pass, so use 1 for a single-step capture.
        guidance_scale: Guidance scale passed to the pipeline.
        height: Frame height passed to the pipeline.
        width: Frame width passed to the pipeline.
        num_frames: Frame count passed to the pipeline.
        device: Device the pipeline and latents are placed on.
        latents_path: File to read the latents from. Defaults to
            ``save_dir/latents.pt``, so one function serves every clip.

    Returns:
        Dict mapping feature names to CPU tensors.
    """
    if latents_path is None:
        latents_path = os.path.join(save_dir, "latents.pt")

    # Load onto the CPU first so latents saved from another device still load.
    latents = torch.load(latents_path, weights_only=True, map_location="cpu")['tensor']
    # Swap dims 1 and 2 before handing the latents to the pipeline.
    # NOTE(review): assumes the saved layout is [B, C, T, H, W] and the
    # pipeline wants [B, T, C, H, W] — confirm. The latents are passed as
    # saved (no added noise, no rescaling) — confirm that is intended.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    os.makedirs(save_dir, exist_ok=True)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the generated video is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True
        )

        transformer_features = {
            key: feat.detach().cpu() for key, feat in trans_hook.features.items()
        }

        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        # Release the model references before flushing the caches; otherwise
        # the flush cannot reclaim the model's memory.
        del trans_hook, pipeline
        clear_memory()

    return transformer_features

# Clip selection for this cell. Only save_dir is passed to the call below;
# frame_dir is not read by it.
frame_dir = "data/21"
save_dir = "features/21"

# Phase 2: Extract Transformer features with a single-step pipeline call
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    prompt="A kitesurfer glides dynamically across turquoise waters with a mountainous coastline in the background. Harnessing the wind's power through control lines, they maintain a steady sideways stance while generating impressive spray patterns. As they carve through the water, their board creates a widening wake of white foam. The rider's body remains balanced and tensed, arms extended to steer the kite, while repeatedly lifting off the water's surface in controlled jumps. Their fluid movements demonstrate skilled maneuvering as they navigate the conditions, with each jump producing increasingly dramatic splashes and water patterns beneath them.",
    block_indices=(0,7,14,21,29),
    num_inference_steps=1,
    guidance_scale=1.0,
    height=480,
    width=720,
    num_frames=32,
    device=device
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate activations of a pipeline's transformer via hooks.

    Forward hooks are attached to the blocks of
    ``pipeline.transformer.transformer_blocks`` whose index is listed in
    ``block_indices`` and to the transformer module itself. Every forward pass
    overwrites the previous capture, so after several passes ``features``
    holds the values of the last pass only.

    Keys written to ``features``:
        ``block_{i}_hidden`` / ``block_{i}_enc``: the two outputs of block ``i``.
        ``final_unpatchified``: the transformer's own output tensor.
    """

    def __init__(self, pipeline, block_indices):
        """Register the hooks.

        Args:
            pipeline: Object exposing the model as ``pipeline.transformer``.
            block_indices: Indices of the transformer blocks to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every requested transformer block."""
        pending = set(self.block_indices)
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in pending:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)
                pending.discard(idx)
        if pending:
            # A requested index with no matching block would otherwise be
            # silently missing from the saved features.
            print(f"Warning: no transformer block at indices {sorted(pending)}")

    def _register_final_output_hook(self):
        """Attach a hook to the transformer itself to capture its output."""
        def final_hook(module, inp, out):
            # The output is either an object carrying `.sample`, a
            # tuple/list whose first element is the sample, or the tensor.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Build the hook that stores both outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for h in self.hook_handles:
            h.remove()
        self.hook_handles.clear()

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path=None,
):
    """Phase 2: run the pipeline on saved latents and save transformer features.

    Loads latents from disk, runs the ``THUDM/CogVideoX-2b`` pipeline with
    hooks on the selected transformer blocks, and writes the captured
    activations to ``save_dir/cogvideox_features.pt``.

    Args:
        save_dir: Directory for the output file (created if missing).
        prompt: Text prompt passed to the pipeline.
        block_indices: Transformer block indices to capture.
        num_inference_steps: Number of pipeline steps. The hooks keep only the
            most recent forward pass, so use 1 for a single-step capture.
        guidance_scale: Guidance scale passed to the pipeline.
        height: Frame height passed to the pipeline.
        width: Frame width passed to the pipeline.
        num_frames: Frame count passed to the pipeline.
        device: Device the pipeline and latents are placed on.
        latents_path: File to read the latents from. Defaults to
            ``save_dir/latents.pt``, so one function serves every clip.

    Returns:
        Dict mapping feature names to CPU tensors.
    """
    if latents_path is None:
        latents_path = os.path.join(save_dir, "latents.pt")

    # Load onto the CPU first so latents saved from another device still load.
    latents = torch.load(latents_path, weights_only=True, map_location="cpu")['tensor']
    # Swap dims 1 and 2 before handing the latents to the pipeline.
    # NOTE(review): assumes the saved layout is [B, C, T, H, W] and the
    # pipeline wants [B, T, C, H, W] — confirm. The latents are passed as
    # saved (no added noise, no rescaling) — confirm that is intended.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    os.makedirs(save_dir, exist_ok=True)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the generated video is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True
        )

        transformer_features = {
            key: feat.detach().cpu() for key, feat in trans_hook.features.items()
        }

        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        # Release the model references before flushing the caches; otherwise
        # the flush cannot reclaim the model's memory.
        del trans_hook, pipeline
        clear_memory()

    return transformer_features

# Clip selection for this cell. Only save_dir is passed to the call below;
# frame_dir is not read by it.
frame_dir = "data/20"
save_dir = "features/20"

# Phase 2: Extract Transformer features with a single-step pipeline call
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    prompt="Three lab-coated figures stand in stylized poses on an artificial turf field, with a chain-link fence and trees visible behind them. All wear safety goggles and white attire, differentiated by their boots - one in white sneakers, one in dark brown boots, and one in tan boots. The leftmost person checks their phone while slightly swaying, creating subtle movements. The middle and rightmost figures maintain steady stances but exhibit small postural adjustments, their long hair gently moving in the breeze. Their positioning suggests a coordinated photoshoot, with each person's micro-movements contributing to a dynamic yet controlled composition against the outdoor setting.",
    block_indices=(0,7,14,21,29),
    num_inference_steps=1,
    guidance_scale=1.0,
    height=480,
    width=720,
    num_frames=32,
    device=device
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate activations of a pipeline's transformer via hooks.

    Forward hooks are attached to the blocks of
    ``pipeline.transformer.transformer_blocks`` whose index is listed in
    ``block_indices`` and to the transformer module itself. Every forward pass
    overwrites the previous capture, so after several passes ``features``
    holds the values of the last pass only.

    Keys written to ``features``:
        ``block_{i}_hidden`` / ``block_{i}_enc``: the two outputs of block ``i``.
        ``final_unpatchified``: the transformer's own output tensor.
    """

    def __init__(self, pipeline, block_indices):
        """Register the hooks.

        Args:
            pipeline: Object exposing the model as ``pipeline.transformer``.
            block_indices: Indices of the transformer blocks to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every requested transformer block."""
        pending = set(self.block_indices)
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in pending:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)
                pending.discard(idx)
        if pending:
            # A requested index with no matching block would otherwise be
            # silently missing from the saved features.
            print(f"Warning: no transformer block at indices {sorted(pending)}")

    def _register_final_output_hook(self):
        """Attach a hook to the transformer itself to capture its output."""
        def final_hook(module, inp, out):
            # The output is either an object carrying `.sample`, a
            # tuple/list whose first element is the sample, or the tensor.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Build the hook that stores both outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for h in self.hook_handles:
            h.remove()
        self.hook_handles.clear()

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path=None,
):
    """Phase 2: run the pipeline on saved latents and save transformer features.

    Loads latents from disk, runs the ``THUDM/CogVideoX-2b`` pipeline with
    hooks on the selected transformer blocks, and writes the captured
    activations to ``save_dir/cogvideox_features.pt``.

    Args:
        save_dir: Directory for the output file (created if missing).
        prompt: Text prompt passed to the pipeline.
        block_indices: Transformer block indices to capture.
        num_inference_steps: Number of pipeline steps. The hooks keep only the
            most recent forward pass, so use 1 for a single-step capture.
        guidance_scale: Guidance scale passed to the pipeline.
        height: Frame height passed to the pipeline.
        width: Frame width passed to the pipeline.
        num_frames: Frame count passed to the pipeline.
        device: Device the pipeline and latents are placed on.
        latents_path: File to read the latents from. Defaults to
            ``save_dir/latents.pt``, so one function serves every clip.

    Returns:
        Dict mapping feature names to CPU tensors.
    """
    if latents_path is None:
        latents_path = os.path.join(save_dir, "latents.pt")

    # Load onto the CPU first so latents saved from another device still load.
    latents = torch.load(latents_path, weights_only=True, map_location="cpu")['tensor']
    # Swap dims 1 and 2 before handing the latents to the pipeline.
    # NOTE(review): assumes the saved layout is [B, C, T, H, W] and the
    # pipeline wants [B, T, C, H, W] — confirm. The latents are passed as
    # saved (no added noise, no rescaling) — confirm that is intended.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    os.makedirs(save_dir, exist_ok=True)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the generated video is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True
        )

        transformer_features = {
            key: feat.detach().cpu() for key, feat in trans_hook.features.items()
        }

        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        # Release the model references before flushing the caches; otherwise
        # the flush cannot reclaim the model's memory.
        del trans_hook, pipeline
        clear_memory()

    return transformer_features

# Clip selection for this cell. Only save_dir is passed to the call below;
# frame_dir is not read by it.
frame_dir = "data/19"
save_dir = "features/19"

# Phase 2: Extract Transformer features with a single-step pipeline call
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    prompt="A person wearing tactical gear, including a dark jacket with red accents, slides down a thick rope while holding it with one hand and gripping what appears to be a tactical weapon in the other. They display a consistently bright, enthusiastic grin throughout their descent. As they move downward, bullet shells scatter around them, catching the warm lighting that illuminates the scene. The background features vertical metallic structures and industrial-looking elements. Their motion is dynamic and controlled, maintaining balance while smoothly descending, with their equipment and clothing showing subtle movement from the action.",
    block_indices=(0,7,14,21,29),
    num_inference_steps=1,
    guidance_scale=1.0,
    height=480,
    width=720,
    num_frames=32,
    device=device
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate activations of a pipeline's transformer via hooks.

    Forward hooks are attached to the blocks of
    ``pipeline.transformer.transformer_blocks`` whose index is listed in
    ``block_indices`` and to the transformer module itself. Every forward pass
    overwrites the previous capture, so after several passes ``features``
    holds the values of the last pass only.

    Keys written to ``features``:
        ``block_{i}_hidden`` / ``block_{i}_enc``: the two outputs of block ``i``.
        ``final_unpatchified``: the transformer's own output tensor.
    """

    def __init__(self, pipeline, block_indices):
        """Register the hooks.

        Args:
            pipeline: Object exposing the model as ``pipeline.transformer``.
            block_indices: Indices of the transformer blocks to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every requested transformer block."""
        pending = set(self.block_indices)
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in pending:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)
                pending.discard(idx)
        if pending:
            # A requested index with no matching block would otherwise be
            # silently missing from the saved features.
            print(f"Warning: no transformer block at indices {sorted(pending)}")

    def _register_final_output_hook(self):
        """Attach a hook to the transformer itself to capture its output."""
        def final_hook(module, inp, out):
            # The output is either an object carrying `.sample`, a
            # tuple/list whose first element is the sample, or the tensor.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Build the hook that stores both outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for h in self.hook_handles:
            h.remove()
        self.hook_handles.clear()

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path=None,
):
    """Phase 2: run the pipeline on saved latents and save transformer features.

    Loads latents from disk, runs the ``THUDM/CogVideoX-2b`` pipeline with
    hooks on the selected transformer blocks, and writes the captured
    activations to ``save_dir/cogvideox_features.pt``.

    Args:
        save_dir: Directory for the output file (created if missing).
        prompt: Text prompt passed to the pipeline.
        block_indices: Transformer block indices to capture.
        num_inference_steps: Number of pipeline steps. The hooks keep only the
            most recent forward pass, so use 1 for a single-step capture.
        guidance_scale: Guidance scale passed to the pipeline.
        height: Frame height passed to the pipeline.
        width: Frame width passed to the pipeline.
        num_frames: Frame count passed to the pipeline.
        device: Device the pipeline and latents are placed on.
        latents_path: File to read the latents from. Defaults to
            ``save_dir/latents.pt``, so one function serves every clip.

    Returns:
        Dict mapping feature names to CPU tensors.
    """
    if latents_path is None:
        latents_path = os.path.join(save_dir, "latents.pt")

    # Load onto the CPU first so latents saved from another device still load.
    latents = torch.load(latents_path, weights_only=True, map_location="cpu")['tensor']
    # Swap dims 1 and 2 before handing the latents to the pipeline.
    # NOTE(review): assumes the saved layout is [B, C, T, H, W] and the
    # pipeline wants [B, T, C, H, W] — confirm. The latents are passed as
    # saved (no added noise, no rescaling) — confirm that is intended.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    os.makedirs(save_dir, exist_ok=True)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the generated video is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True
        )

        transformer_features = {
            key: feat.detach().cpu() for key, feat in trans_hook.features.items()
        }

        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        # Release the model references before flushing the caches; otherwise
        # the flush cannot reclaim the model's memory.
        del trans_hook, pipeline
        clear_memory()

    return transformer_features

# Clip selection for this cell. Only save_dir is passed to the call below;
# frame_dir is not read by it.
frame_dir = "data/18"
save_dir = "features/18"

# Phase 2: Extract Transformer features with a single-step pipeline call
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    prompt="A cyclist in athletic wear and a cap performs maintenance on a road bike indoors. Standing barefoot, they begin by examining the pedals and drivetrain, leaning in close to inspect the components. Their movements are methodical as they rotate the cranks and check the chain tension. Next, they turn their attention to the front wheel, carefully lifting and spinning it to assess alignment. The cyclist continues their inspection by examining both wheels thoroughly, demonstrating careful attention to detail throughout the maintenance routine. The scene takes place in a room with wooden flooring, brick walls, and white stairs in the background.",
    block_indices=(0,7,14,21,29),
    num_inference_steps=1,
    guidance_scale=1.0,
    height=480,
    width=720,
    num_frames=32,
    device=device
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate activations of a pipeline's transformer via hooks.

    Forward hooks are attached to the blocks of
    ``pipeline.transformer.transformer_blocks`` whose index is listed in
    ``block_indices`` and to the transformer module itself. Every forward pass
    overwrites the previous capture, so after several passes ``features``
    holds the values of the last pass only.

    Keys written to ``features``:
        ``block_{i}_hidden`` / ``block_{i}_enc``: the two outputs of block ``i``.
        ``final_unpatchified``: the transformer's own output tensor.
    """

    def __init__(self, pipeline, block_indices):
        """Register the hooks.

        Args:
            pipeline: Object exposing the model as ``pipeline.transformer``.
            block_indices: Indices of the transformer blocks to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every requested transformer block."""
        pending = set(self.block_indices)
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in pending:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)
                pending.discard(idx)
        if pending:
            # A requested index with no matching block would otherwise be
            # silently missing from the saved features.
            print(f"Warning: no transformer block at indices {sorted(pending)}")

    def _register_final_output_hook(self):
        """Attach a hook to the transformer itself to capture its output."""
        def final_hook(module, inp, out):
            # The output is either an object carrying `.sample`, a
            # tuple/list whose first element is the sample, or the tensor.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Build the hook that stores both outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for h in self.hook_handles:
            h.remove()
        self.hook_handles.clear()

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/17/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 17: input frames directory and output directory for its features.
frame_dir, save_dir = "data/17", "features/17"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="A cream-colored dog with a fluffy coat explores an outdoor area with dry terrain and scattered vegetation. The dog moves deliberately across the ground, keeping its nose low while walking steadily forward. Its tail is held upright as it appears to be tracking or investigating something of interest. The dog maintains a focused, alert posture throughout its movement, suggesting it may be following a scent trail. The surroundings include a mesh fence in the background and some green foliage along the edges of the space. A red harness or collar is visible on the dog's body as it continues its methodical investigation of the area.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Record outputs of selected transformer blocks and of the transformer itself.

    Forward hooks are installed on construction. Every forward pass overwrites
    the entries of ``features``:

    * ``block_{i}_hidden`` / ``block_{i}_enc``: the two values returned by
      ``transformer.transformer_blocks[i]`` for each ``i`` in ``block_indices``.
    * ``final_unpatchified``: the detached output of the transformer module.

    Call ``remove_hooks()`` once extraction is finished.
    """

    def __init__(self, pipeline, block_indices):
        """Hook ``pipeline.transformer`` and the blocks named by ``block_indices``.

        Args:
            pipeline: Object exposing a ``transformer`` module that has a
                ``transformer_blocks`` sequence.
            block_indices: Container of block positions to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every block whose index is in ``block_indices``."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself that stores its final output."""
        def final_hook(module, inp, out):
            # Look for ``.sample`` before the tuple case: an output object that
            # exposes ``.sample`` need not be a tuple, and would otherwise fall
            # through to ``out.detach()`` and raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        # Register hook on the transformer itself
        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            # The block is expected to return exactly two values.
            hidden_states, enc_hidden_states = out
            # Stored as returned (no detach / CPU copy here); callers decide
            # when to move the tensors off the device.
            self.features[f"block_{block_idx}_hidden"] = hidden_states
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook; safe to call more than once."""
        for h in self.hook_handles:
            h.remove()
        # Drop the dead handles so the list reflects what is still attached.
        self.hook_handles.clear()

    def clear_features(self):
        """Forget all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/16/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 16: input frames directory and output directory for its features.
frame_dir, save_dir = "data/16", "features/16"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="A black swan glides gracefully through dark green water, creating gentle ripples as it moves. The swan's distinctive features are prominently displayed - its elongated neck held in an elegant S-curve, bright red beak contrasting with its deep ebony plumage, and feathers arranged in a textured pattern across its body. The background consists of leafy green foliage hanging over a concrete embankment. The swan maintains a steady, smooth swimming motion while keeping its head position relatively stable, demonstrating the characteristic poise and grace these waterfowl are known for. Reflections dance on the water's surface as the bird propels itself forward.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Record outputs of selected transformer blocks and of the transformer itself.

    Forward hooks are installed on construction. Every forward pass overwrites
    the entries of ``features``:

    * ``block_{i}_hidden`` / ``block_{i}_enc``: the two values returned by
      ``transformer.transformer_blocks[i]`` for each ``i`` in ``block_indices``.
    * ``final_unpatchified``: the detached output of the transformer module.

    Call ``remove_hooks()`` once extraction is finished.
    """

    def __init__(self, pipeline, block_indices):
        """Hook ``pipeline.transformer`` and the blocks named by ``block_indices``.

        Args:
            pipeline: Object exposing a ``transformer`` module that has a
                ``transformer_blocks`` sequence.
            block_indices: Container of block positions to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every block whose index is in ``block_indices``."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself that stores its final output."""
        def final_hook(module, inp, out):
            # Look for ``.sample`` before the tuple case: an output object that
            # exposes ``.sample`` need not be a tuple, and would otherwise fall
            # through to ``out.detach()`` and raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        # Register hook on the transformer itself
        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            # The block is expected to return exactly two values.
            hidden_states, enc_hidden_states = out
            # Stored as returned (no detach / CPU copy here); callers decide
            # when to move the tensors off the device.
            self.features[f"block_{block_idx}_hidden"] = hidden_states
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook; safe to call more than once."""
        for h in self.hook_handles:
            h.remove()
        # Drop the dead handles so the list reflects what is still attached.
        self.hook_handles.clear()

    def clear_features(self):
        """Forget all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/15/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 15: input frames directory and output directory for its features.
frame_dir, save_dir = "data/15", "features/15"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="A dromedary camel walks steadily along a sandy enclosure bordered by wooden fencing and green vegetation. The camel's movements reveal a distinctive gait pattern as it strides forward, lifting and placing each leg in a deliberate rhythm. Its beige-colored body sways gently with each step, while the prominent single hump maintains balance during locomotion. The camel's long neck extends forward as it moves, and its tail occasionally twitches. The lighting casts clear shadows beneath the animal as it traverses the enclosure, highlighting the natural, fluid motion of its muscular legs and characteristic pacing style.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Record outputs of selected transformer blocks and of the transformer itself.

    Forward hooks are installed on construction. Every forward pass overwrites
    the entries of ``features``:

    * ``block_{i}_hidden`` / ``block_{i}_enc``: the two values returned by
      ``transformer.transformer_blocks[i]`` for each ``i`` in ``block_indices``.
    * ``final_unpatchified``: the detached output of the transformer module.

    Call ``remove_hooks()`` once extraction is finished.
    """

    def __init__(self, pipeline, block_indices):
        """Hook ``pipeline.transformer`` and the blocks named by ``block_indices``.

        Args:
            pipeline: Object exposing a ``transformer`` module that has a
                ``transformer_blocks`` sequence.
            block_indices: Container of block positions to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every block whose index is in ``block_indices``."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself that stores its final output."""
        def final_hook(module, inp, out):
            # Look for ``.sample`` before the tuple case: an output object that
            # exposes ``.sample`` need not be a tuple, and would otherwise fall
            # through to ``out.detach()`` and raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        # Register hook on the transformer itself
        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            # The block is expected to return exactly two values.
            hidden_states, enc_hidden_states = out
            # Stored as returned (no detach / CPU copy here); callers decide
            # when to move the tensors off the device.
            self.features[f"block_{block_idx}_hidden"] = hidden_states
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook; safe to call more than once."""
        for h in self.hook_handles:
            h.remove()
        # Drop the dead handles so the list reflects what is still attached.
        self.hook_handles.clear()

    def clear_features(self):
        """Forget all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/14/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 14: input frames directory and output directory for its features.
frame_dir, save_dir = "data/14", "features/14"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="A paraglider prepares for takeoff from a grassy mountainside overlooking a sprawling town and forested slopes below. Wearing gray protective gear, a helmet, and carrying a large backpack, they progressively build momentum by running forward while handling the control lines. As they gain speed, their movements become more dynamic, transitioning from a steady jog to increasingly powerful strides. The running motion intensifies until they achieve sufficient velocity, at which point they leap forward, their feet leaving the ground as the paragliding canopy lifts them into the air, beginning their descent toward the valley below.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Record outputs of selected transformer blocks and of the transformer itself.

    Forward hooks are installed on construction. Every forward pass overwrites
    the entries of ``features``:

    * ``block_{i}_hidden`` / ``block_{i}_enc``: the two values returned by
      ``transformer.transformer_blocks[i]`` for each ``i`` in ``block_indices``.
    * ``final_unpatchified``: the detached output of the transformer module.

    Call ``remove_hooks()`` once extraction is finished.
    """

    def __init__(self, pipeline, block_indices):
        """Hook ``pipeline.transformer`` and the blocks named by ``block_indices``.

        Args:
            pipeline: Object exposing a ``transformer`` module that has a
                ``transformer_blocks`` sequence.
            block_indices: Container of block positions to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every block whose index is in ``block_indices``."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself that stores its final output."""
        def final_hook(module, inp, out):
            # Look for ``.sample`` before the tuple case: an output object that
            # exposes ``.sample`` need not be a tuple, and would otherwise fall
            # through to ``out.detach()`` and raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        # Register hook on the transformer itself
        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            # The block is expected to return exactly two values.
            hidden_states, enc_hidden_states = out
            # Stored as returned (no detach / CPU copy here); callers decide
            # when to move the tensors off the device.
            self.features[f"block_{block_idx}_hidden"] = hidden_states
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook; safe to call more than once."""
        for h in self.hook_handles:
            h.remove()
        # Drop the dead handles so the list reflects what is still attached.
        self.hook_handles.clear()

    def clear_features(self):
        """Forget all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/13/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 13: input frames directory and output directory for its features.
frame_dir, save_dir = "data/13", "features/13"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="Five goldfish swim gracefully in a blue-lit aquarium environment, with one particularly distinctive fish having more prominent eyes and rounder features than its companions. The goldfish exhibit smooth, fluid movements as they navigate around aquatic plants and decorative elements, their orange-gold scales shimmering against the deep blue background. Each fish maintains its own swimming pattern - some staying higher in the tank, others swimming in the middle space, creating natural layers of movement. Their fins and tails flow elegantly as they glide through the water, demonstrating the characteristic fluid motion of healthy, active goldfish in their aquatic habitat.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Record outputs of selected transformer blocks and of the transformer itself.

    Forward hooks are installed on construction. Every forward pass overwrites
    the entries of ``features``:

    * ``block_{i}_hidden`` / ``block_{i}_enc``: the two values returned by
      ``transformer.transformer_blocks[i]`` for each ``i`` in ``block_indices``.
    * ``final_unpatchified``: the detached output of the transformer module.

    Call ``remove_hooks()`` once extraction is finished.
    """

    def __init__(self, pipeline, block_indices):
        """Hook ``pipeline.transformer`` and the blocks named by ``block_indices``.

        Args:
            pipeline: Object exposing a ``transformer`` module that has a
                ``transformer_blocks`` sequence.
            block_indices: Container of block positions to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every block whose index is in ``block_indices``."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself that stores its final output."""
        def final_hook(module, inp, out):
            # Look for ``.sample`` before the tuple case: an output object that
            # exposes ``.sample`` need not be a tuple, and would otherwise fall
            # through to ``out.detach()`` and raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        # Register hook on the transformer itself
        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            # The block is expected to return exactly two values.
            hidden_states, enc_hidden_states = out
            # Stored as returned (no detach / CPU copy here); callers decide
            # when to move the tensors off the device.
            self.features[f"block_{block_idx}_hidden"] = hidden_states
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook; safe to call more than once."""
        for h in self.hook_handles:
            h.remove()
        # Drop the dead handles so the list reflects what is still attached.
        self.hook_handles.clear()

    def clear_features(self):
        """Forget all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/12/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 12: input frames directory and output directory for its features.
frame_dir, save_dir = "data/12", "features/12"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="A brown and white spotted cow stands in a grassy field with patches of bare muddy ground. The cow wears a dark collar with what appears to be a bell attached. The animal moves deliberately forward along a path marked by wooden posts and a thin wire or rope. The lighting remains consistent throughout, highlighting the cow's distinctive coat pattern and casting shadows beneath it. The ground shows signs of wear and use, with a mix of grass on one side and exposed dirt on the path where the cow walks. In the lower corner, a partial view of what seems to be a container or equipment with orange markings is visible.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Record outputs of selected transformer blocks and of the transformer itself.

    Forward hooks are installed on construction. Every forward pass overwrites
    the entries of ``features``:

    * ``block_{i}_hidden`` / ``block_{i}_enc``: the two values returned by
      ``transformer.transformer_blocks[i]`` for each ``i`` in ``block_indices``.
    * ``final_unpatchified``: the detached output of the transformer module.

    Call ``remove_hooks()`` once extraction is finished.
    """

    def __init__(self, pipeline, block_indices):
        """Hook ``pipeline.transformer`` and the blocks named by ``block_indices``.

        Args:
            pipeline: Object exposing a ``transformer`` module that has a
                ``transformer_blocks`` sequence.
            block_indices: Container of block positions to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every block whose index is in ``block_indices``."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself that stores its final output."""
        def final_hook(module, inp, out):
            # Look for ``.sample`` before the tuple case: an output object that
            # exposes ``.sample`` need not be a tuple, and would otherwise fall
            # through to ``out.detach()`` and raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        # Register hook on the transformer itself
        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            # The block is expected to return exactly two values.
            hidden_states, enc_hidden_states = out
            # Stored as returned (no detach / CPU copy here); callers decide
            # when to move the tensors off the device.
            self.features[f"block_{block_idx}_hidden"] = hidden_states
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook; safe to call more than once."""
        for h in self.hook_handles:
            h.remove()
        # Drop the dead handles so the list reflects what is still attached.
        self.hook_handles.clear()

    def clear_features(self):
        """Forget all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/11/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 11: input frames directory and output directory for its features.
frame_dir, save_dir = "data/11", "features/11"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="Three pigs forage together on dirt ground - a large black and white adult sow flanked by two smaller piglets, one brown and one white with brown spots. The brown piglet stays close to the middle while the spotted one explores on the left. All three pigs move methodically forward, keeping their snouts low to the ground as they search for food. The adult sow leads the way with purposeful steps, her distinctive black body contrasting with white patches around her face and legs. Their movements are synchronized and deliberate as they investigate the terrain together.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Record outputs of selected transformer blocks and of the transformer itself.

    Forward hooks are installed on construction. Every forward pass overwrites
    the entries of ``features``:

    * ``block_{i}_hidden`` / ``block_{i}_enc``: the two values returned by
      ``transformer.transformer_blocks[i]`` for each ``i`` in ``block_indices``.
    * ``final_unpatchified``: the detached output of the transformer module.

    Call ``remove_hooks()`` once extraction is finished.
    """

    def __init__(self, pipeline, block_indices):
        """Hook ``pipeline.transformer`` and the blocks named by ``block_indices``.

        Args:
            pipeline: Object exposing a ``transformer`` module that has a
                ``transformer_blocks`` sequence.
            block_indices: Container of block positions to capture.
        """
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to every block whose index is in ``block_indices``."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself that stores its final output."""
        def final_hook(module, inp, out):
            # Look for ``.sample`` before the tuple case: an output object that
            # exposes ``.sample`` need not be a tuple, and would otherwise fall
            # through to ``out.detach()`` and raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        # Register hook on the transformer itself
        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            # The block is expected to return exactly two values.
            hidden_states, enc_hidden_states = out
            # Stored as returned (no detach / CPU copy here); callers decide
            # when to move the tensors off the device.
            self.features[f"block_{block_idx}_hidden"] = hidden_states
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook; safe to call more than once."""
        for h in self.hook_handles:
            h.remove()
        # Drop the dead handles so the list reflects what is still attached.
        self.hook_handles.clear()

    def clear_features(self):
        """Forget all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/10/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked transformer features.

    Args:
        save_dir: Directory that receives ``cogvideox_features.pt``.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        width: Forwarded to the pipeline.
        num_frames: Forwarded to the pipeline.
        device: Device for the latents and the pipeline.
        latents_path: ``torch.save`` file holding a dict with the latents
            under the ``'tensor'`` key.

    Returns:
        Dict mapping feature name to a CPU tensor (the same dict written to disk).
    """
    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W) as the
    # pipeline expects; TODO confirm against the code that wrote the latents.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects are needed; the pipeline output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        # Copy to CPU before the hook storage is cleared in ``finally``.
        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # Make sure the target directory exists ("" means the working directory).
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        # NOTE(review): ``pipeline`` is still referenced while clear_memory()
        # runs, so its weights presumably stay allocated until return -- confirm.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Sample 10: input frames directory and output directory for its features.
frame_dir, save_dir = "data/10", "features/10"

# Phase 2: a single pipeline step with hooks on the chosen transformer blocks.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir,
    prompt="People walk along a sunlit street in what appears to be India, with several women wearing traditional attire including salwar kameez and sarees in vibrant colors like yellow, pink, and red. A woman in a brown and pink outfit maintains a steady pace in the center, carrying bags as she moves forward. Behind her, motorcycles are parked while pedestrians mill about. The scene has a warm, golden atmosphere created by late afternoon sunlight filtering through trees lining the street. Signs are visible on poles, and parked cars line one side of the road as people continue their daily activities.",
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    # Blocks to capture and where to run
    block_indices=(0, 7, 14, 21, 29),
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/9/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Locations for this sample: features are written under save_dir;
# frame_dir is only assigned here, it is not read by the call below.
save_dir = "features/9"
frame_dir = "data/9"

# Phase 2: a single pipeline call with one inference step, used purely to
# capture the hooked transformer activations for this sample.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    # Adjacent literals are concatenated into one unbroken prompt string.
    prompt=(
        "A person wearing a gray t-shirt and dark pants performs a parkour vault movement over a concrete wall. "
        "They approach the wall with momentum, placing their hands on the top surface while lifting their body upward in a fluid motion. "
        "As they elevate, their legs tuck close to their body before extending outward in a controlled manner. "
        "The movement transitions into a precise landing position where they maintain balance while absorbing the impact. "
        "The urban setting includes metal railings, stepped pathways, and apartment buildings in the background, all under a clear blue sky."
    ),
    block_indices=(0, 7, 14, 21, 29),
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/8/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Locations for this sample: features are written under save_dir;
# frame_dir is only assigned here, it is not read by the call below.
save_dir = "features/8"
frame_dir = "data/8"

# Phase 2: a single pipeline call with one inference step, used purely to
# capture the hooked transformer activations for this sample.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    # Adjacent literals are concatenated into one unbroken prompt string.
    prompt=(
        "A person wearing a yellow jacket and khaki pants stands in a grassy field under cloudy skies, accompanied by two dogs - a larger black and white dog and a smaller tricolor dog. "
        "The person trains the dogs using treats, with both canines attentively watching and following commands. "
        "The smaller dog performs tricks including standing on its hind legs and jumping up, while the larger dog observes. "
        "Throughout their training session, the person maintains a consistent stance while dispensing treats and giving commands, demonstrating a well-practiced routine with their responsive and eager canine companions against the backdrop of a serene rural landscape."
    ),
    block_indices=(0, 7, 14, 21, 29),
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/7/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Locations for this sample: features are written under save_dir;
# frame_dir is only assigned here, it is not read by the call below.
save_dir = "features/7"
frame_dir = "data/7"

# Phase 2: a single pipeline call with one inference step, used purely to
# capture the hooked transformer activations for this sample.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    # Adjacent literals are concatenated into one unbroken prompt string.
    prompt=(
        "Two people ride a custom-built cart with a bright blue base and a large cylindrical tank mounted on top as they approach and traverse a wooden ramp installed on a residential street. "
        "The driver wears a helmet while a passenger stands behind them. "
        "The ramp has yellow-painted panels supported by green and orange metal struts. "
        "As the cart moves forward, it maintains steady momentum to successfully jump the ramp while spectators watch from the sidewalk, including a person in red and others observing the stunt. "
        "The cart appears to achieve a smooth takeoff and landing during its trajectory over the ramp."
    ),
    block_indices=(0, 7, 14, 21, 29),
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/6/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Locations for this sample: features are written under save_dir;
# frame_dir is only assigned here, it is not read by the call below.
save_dir = "features/6"
frame_dir = "data/6"

# Phase 2: a single pipeline call with one inference step, used purely to
# capture the hooked transformer activations for this sample.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    # Adjacent literals are concatenated into one unbroken prompt string.
    prompt=(
        "Two martial artists in white uniforms engage in a judo throw technique on a green training mat while others observe from the edges. "
        "The executing practitioner initiates by gripping their partner's uniform, establishing a close standing position. "
        "Through a fluid combination of pulling and rotating movements, they lift and pivot their partner off balance, causing them to become airborne in a forward flip motion. "
        "The throw culminates with the partner landing on their back on the mat while the executing practitioner maintains control through the grip, following through to a final ground position to complete the throwing technique."
    ),
    block_indices=(0, 7, 14, 21, 29),
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/5/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Locations for this sample: features are written under save_dir;
# frame_dir is only assigned here, it is not read by the call below.
save_dir = "features/5"
frame_dir = "data/5"

# Phase 2: a single pipeline call with one inference step, used purely to
# capture the hooked transformer activations for this sample.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    # Adjacent literals are concatenated into one unbroken prompt string.
    prompt=(
        "A red drift car performs a controlled slide across a race track, leaving dark tire marks on the concrete surface. "
        "The car maintains a precise sideways angle while navigating along a barrier lined with alternating red and white safety barriers. "
        "In the background, white canopy tents and parked vehicles are visible, along with a sign. "
        "The track appears to be part of a motorsport venue with spectator areas marked by green and yellow barriers. "
        "The motion is fluid and deliberate, demonstrating skilled car control as the driver executes the drift maneuver with consistent speed and angle throughout the run."
    ),
    block_indices=(0, 7, 14, 21, 29),
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/4/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Locations for this sample: features are written under save_dir;
# frame_dir is only assigned here, it is not read by the call below.
save_dir = "features/4"
frame_dir = "data/4"

# Phase 2: a single pipeline call with one inference step, used purely to
# capture the hooked transformer activations for this sample.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    # Adjacent literals are concatenated into one unbroken prompt string.
    prompt=(
        "A drift car navigates a curved track at the Osaka Maishima Sports Island, demonstrating precise car control through controlled sliding. "
        "The red and white vehicle enters the corner while initiating a drift, generating thick white smoke from the tires as it maintains a sideways angle through the turn. "
        "The driver skillfully modulates the throttle and steering to maintain the optimal drift angle, with tire smoke billowing consistently throughout the maneuver. "
        "The car's trajectory follows the curve of the track marked by white lines, while spectators and event infrastructure, including display screens and temporary structures, are visible in the background."
    ),
    block_indices=(0, 7, 14, 21, 29),
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/3/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Locations for this sample: features are written under save_dir;
# frame_dir is only assigned here, it is not read by the call below.
save_dir = "features/3"
frame_dir = "data/3"

# Phase 2: a single pipeline call with one inference step, used purely to
# capture the hooked transformer activations for this sample.
single_step_features = extract_and_save_transformer_features_single_step(
    save_dir=save_dir,
    # Adjacent literals are concatenated into one unbroken prompt string.
    prompt=(
        "A breakdancer performs a dynamic handstand sequence in front of an engaged crowd on an outdoor wooden platform, with ornate building facades as the backdrop. "
        "The dancer, wearing a red sweatshirt and light jeans, begins with a controlled handstand, then transitions through a series of fluid rotations and spins while balanced on their hands. "
        "The crowd, consisting of young people in casual streetwear, forms a semicircle around the performance space, watching intently as the dancer executes complex breaking movements. "
        "The sunlit scene captures the athletic precision and artistic expression of street dance culture."
    ),
    block_indices=(0, 7, 14, 21, 29),
    # Video geometry
    height=480,
    width=720,
    num_frames=32,
    # Sampling settings
    num_inference_steps=1,
    guidance_scale=1.0,
    device=device,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture intermediate outputs of a pipeline's transformer via forward hooks.

    On construction a forward hook is registered on every entry of
    ``pipeline.transformer.transformer_blocks`` whose position is in
    ``block_indices``, plus one hook on the transformer module itself.
    Captured tensors are stored in ``self.features`` under the keys
    ``block_<idx>_hidden``, ``block_<idx>_enc`` and ``final_unpatchified``.
    Every forward pass overwrites the previous entries, so after several
    passes only the activations of the last one remain.

    Args:
        pipeline: Object exposing a ``transformer`` module that has a
            ``transformer_blocks`` sequence.
        block_indices: Container of block positions to hook.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}  # feature name -> captured tensor
        self.hook_handles = []  # handles returned by register_forward_hook
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook that records the output of the transformer itself."""

        def final_hook(module, inp, out):
            # Resolve the tensor regardless of the output container: a bare
            # tensor, an output object carrying ``.sample``, or a tuple/list
            # whose first element is the tensor.
            if isinstance(out, torch.Tensor):
                final_output = out
            elif hasattr(out, "sample"):
                final_output = out.sample
            elif isinstance(out, (tuple, list)):
                final_output = out[0]
            else:
                final_output = out
            self.features["final_unpatchified"] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Return a forward hook that stores the two outputs of block ``block_idx``."""

        def hook_fn(module, inp, out):
            # The hooked block is expected to return exactly two tensors.
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # matching what the final-output hook does.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()

        return hook_fn

    def remove_hooks(self):
        """Remove every registered hook; calling it again is a no-op."""
        for h in self.hook_handles:
            h.remove()
        # Forget the handles so they are neither kept alive nor removed twice.
        self.hook_handles = []

    def clear_features(self):
        """Drop all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path="features/2/latents.pt",
):
    """Run CogVideoX-2b on stored latents and save the hooked features.

    Loads a latent tensor from ``latents_path``, runs ``CogVideoXPipeline``
    with the forward hooks installed by ``TransformerFeatureExtractor``,
    copies every captured tensor to the CPU and writes the resulting dict to
    ``<save_dir>/cogvideox_features.pt``. The pipeline weights are loaded in
    float16 on every call.

    Args:
        save_dir: Directory receiving ``cogvideox_features.pt``; created if
            it does not exist yet.
        prompt: Text prompt forwarded to the pipeline.
        block_indices: Transformer block positions to hook.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        height: Forwarded to the pipeline unchanged.
        width: Forwarded to the pipeline unchanged.
        num_frames: Forwarded to the pipeline unchanged.
        device: Device the latents and the pipeline are moved to.
        latents_path: File read with ``torch.load``; it must hold a mapping
            with a ``'tensor'`` entry. Defaults to the path that used to be
            hard-coded in this function.

    Returns:
        Dict mapping feature name to CPU tensor (the same dict that is saved).
    """
    latents = torch.load(latents_path, weights_only=True)["tensor"]
    # Swap axes 1 and 2 -- presumably (B, C, F, H, W) -> (B, F, C, H, W),
    # the layout the pipeline takes for ``latents``; TODO confirm.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-2b", torch_dtype=torch.float16
    ).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter here, so the pipeline's own
        # return value is intentionally discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        # torch.save does not create missing directories.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")
    finally:
        # Always unhook and release captured tensors, even when the pipeline
        # call fails; clear_memory() is defined outside this block.
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        clear_memory()

    return transformer_features

# Input/output locations for video 2.
save_dir = "features/2"
frame_dir = "data/2"

# Phase 2: one diffusion step through the pipeline, capturing transformer
# block outputs via forward hooks.
single_step_features = extract_and_save_transformer_features_single_step(
    prompt=(
        "A dirt bike rider performs a dramatic jump sequence on an off-road track surrounded by pine forest. "
        "Wearing bright protective gear with a neon helmet, the rider approaches the jump with speed, launches into the air, and executes a series of aerial maneuvers. "
        "The green motorcycle tilts upward as it gains height, revealing the underside of the bike and demonstrating the rider's control. "
        "The sequence captures multiple angles of the airborne bike, from initial takeoff through peak elevation, showcasing both technical skill and the dynamic relationship between rider and machine in motocross sport."
    ),
    save_dir=save_dir,
    device=device,
    block_indices=(0, 7, 14, 21, 29),
    height=480,
    width=720,
    num_frames=32,
    num_inference_steps=1,
    guidance_scale=1.0,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture activations of a pipeline's transformer through forward hooks.

    On construction a forward hook is attached to every module in
    ``pipeline.transformer.transformer_blocks`` whose position is listed in
    ``block_indices``, plus one hook on the transformer module itself.
    Every forward pass overwrites the matching entries of ``self.features``:

    * ``"block_{idx}_hidden"`` / ``"block_{idx}_enc"`` -- the two outputs of
      block ``idx`` (each block is expected to return a 2-tuple).
    * ``"final_unpatchified"`` -- the tensor returned by the transformer.

    Stored tensors are detached from autograd but are left on the device
    they were produced on; callers move them to CPU if needed.

    Args:
        pipeline: Object exposing a ``transformer`` attribute whose
            ``transformer_blocks`` is an iterable of ``nn.Module`` blocks.
        block_indices: Container of integer block positions to hook.
            Indices that do not exist are silently ignored.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself to capture its output."""
        def final_hook(module, inp, out):
            # Look for `.sample` first: an output object carrying the tensor
            # in `.sample` is not necessarily a tuple, and calling
            # `.detach()` on the wrapper itself would raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Build the forward hook that records the outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # consistent with how the final output is stored.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for h in self.hook_handles:
            h.remove()
        # Drop the stale handles so repeated calls are clean no-ops.
        self.hook_handles = []

    def clear_features(self):
        """Discard all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path=None,
):
    """Run the CogVideoX-2b pipeline on stored latents and save hooked features.

    Loads latents from disk, attaches a ``TransformerFeatureExtractor`` to the
    pipeline's transformer, runs the pipeline, copies every captured tensor to
    CPU and writes the resulting dict to ``<save_dir>/cogvideox_features.pt``.

    Args:
        save_dir: Directory the feature file is written to; created if it
            does not exist.
        prompt: Text prompt passed to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline call.
        guidance_scale: Forwarded to the pipeline call.
        height: Forwarded to the pipeline call.
        width: Forwarded to the pipeline call.
        num_frames: Forwarded to the pipeline call.
        device: Device the latents and the pipeline are moved to.
        latents_path: File holding a dict with a ``'tensor'`` entry. Defaults
            to ``<save_dir>/latents.pt`` so one definition serves every video
            directory instead of hard-coding a single path.

    Returns:
        dict mapping feature name to CPU tensor, as captured by the hooks.
    """
    if latents_path is None:
        latents_path = os.path.join(save_dir, "latents.pt")

    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2. Presumably (B, C, T, H, W) -> (B, T, C, H, W) to
    # match what the pipeline expects -- TODO confirm against the writer of
    # latents.pt.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter; the generated output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        # Drop the local references to the model and inputs first so that
        # clear_memory() (defined elsewhere in this file) can presumably
        # reclaim their device memory.
        del trans_hook, pipeline, latents
        clear_memory()

    return transformer_features

# Input/output locations for video 1.
save_dir = "features/1"
frame_dir = "data/1"

# Phase 2: one diffusion step through the pipeline, capturing transformer
# block outputs via forward hooks.
single_step_features = extract_and_save_transformer_features_single_step(
    prompt=(
        "A gray compact SUV moves steadily along an urban street, passing through a curved section of road. "
        "The vehicle maintains a consistent speed as it navigates past historic buildings with tan facades and large columns. "
        "The scene is set in what appears to be a European city, with parked cars lining the street and yellow flowers blooming in raised planters near a large stone building. "
        "A blue directional arrow sign is visible in the background, and the lighting suggests it's during daytime. "
        "The SUV's black wheels and metallic paint glisten as it progresses through the gentle curve of the street."
    ),
    save_dir=save_dir,
    device=device,
    block_indices=(0, 7, 14, 21, 29),
    height=480,
    width=720,
    num_frames=32,
    num_inference_steps=1,
    guidance_scale=1.0,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

In [None]:
class TransformerFeatureExtractor:
    """Capture activations of a pipeline's transformer through forward hooks.

    On construction a forward hook is attached to every module in
    ``pipeline.transformer.transformer_blocks`` whose position is listed in
    ``block_indices``, plus one hook on the transformer module itself.
    Every forward pass overwrites the matching entries of ``self.features``:

    * ``"block_{idx}_hidden"`` / ``"block_{idx}_enc"`` -- the two outputs of
      block ``idx`` (each block is expected to return a 2-tuple).
    * ``"final_unpatchified"`` -- the tensor returned by the transformer.

    Stored tensors are detached from autograd but are left on the device
    they were produced on; callers move them to CPU if needed.

    Args:
        pipeline: Object exposing a ``transformer`` attribute whose
            ``transformer_blocks`` is an iterable of ``nn.Module`` blocks.
        block_indices: Container of integer block positions to hook.
            Indices that do not exist are silently ignored.
    """

    def __init__(self, pipeline, block_indices):
        self.transformer = pipeline.transformer
        self.block_indices = block_indices
        self.features = {}
        self.hook_handles = []
        self._register_hooks()
        self._register_final_output_hook()

    def _register_hooks(self):
        """Attach a capture hook to each selected transformer block."""
        for idx, block in enumerate(self.transformer.transformer_blocks):
            if idx in self.block_indices:
                handle = block.register_forward_hook(self._make_hook_fn(idx))
                self.hook_handles.append(handle)

    def _register_final_output_hook(self):
        """Attach a hook on the transformer itself to capture its output."""
        def final_hook(module, inp, out):
            # Look for `.sample` first: an output object carrying the tensor
            # in `.sample` is not necessarily a tuple, and calling
            # `.detach()` on the wrapper itself would raise AttributeError.
            if hasattr(out, 'sample'):
                final_output = out.sample
            elif isinstance(out, tuple):
                final_output = out[0]
            else:
                final_output = out
            self.features['final_unpatchified'] = final_output.detach()

        handle = self.transformer.register_forward_hook(final_hook)
        self.hook_handles.append(handle)

    def _make_hook_fn(self, block_idx):
        """Build the forward hook that records the outputs of block ``block_idx``."""
        def hook_fn(module, inp, out):
            hidden_states, enc_hidden_states = out
            # Detach so stored features never keep an autograd graph alive,
            # consistent with how the final output is stored.
            self.features[f"block_{block_idx}_hidden"] = hidden_states.detach()
            self.features[f"block_{block_idx}_enc"] = enc_hidden_states.detach()
        return hook_fn

    def remove_hooks(self):
        """Detach every registered hook and forget the handles."""
        for h in self.hook_handles:
            h.remove()
        # Drop the stale handles so repeated calls are clean no-ops.
        self.hook_handles = []

    def clear_features(self):
        """Discard all captured tensors."""
        self.features = {}

@torch.no_grad()
def extract_and_save_transformer_features_single_step(
    save_dir,
    prompt,
    block_indices,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    device,
    latents_path=None,
):
    """Run the CogVideoX-2b pipeline on stored latents and save hooked features.

    Loads latents from disk, attaches a ``TransformerFeatureExtractor`` to the
    pipeline's transformer, runs the pipeline, copies every captured tensor to
    CPU and writes the resulting dict to ``<save_dir>/cogvideox_features.pt``.

    Args:
        save_dir: Directory the feature file is written to; created if it
            does not exist.
        prompt: Text prompt passed to the pipeline.
        block_indices: Transformer block positions whose outputs are captured.
        num_inference_steps: Forwarded to the pipeline call.
        guidance_scale: Forwarded to the pipeline call.
        height: Forwarded to the pipeline call.
        width: Forwarded to the pipeline call.
        num_frames: Forwarded to the pipeline call.
        device: Device the latents and the pipeline are moved to.
        latents_path: File holding a dict with a ``'tensor'`` entry. Defaults
            to ``<save_dir>/latents.pt`` so one definition serves every video
            directory instead of hard-coding a single path.

    Returns:
        dict mapping feature name to CPU tensor, as captured by the hooks.
    """
    if latents_path is None:
        latents_path = os.path.join(save_dir, "latents.pt")

    latents = torch.load(latents_path, weights_only=True)['tensor']
    # Swap dims 1 and 2. Presumably (B, C, T, H, W) -> (B, T, C, H, W) to
    # match what the pipeline expects -- TODO confirm against the writer of
    # latents.pt.
    latents = latents.permute(0, 2, 1, 3, 4)
    latents = latents.to(device=device, dtype=torch.float16)

    pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to(device)
    pipeline.transformer.eval()

    # Hook the transformer blocks
    trans_hook = TransformerFeatureExtractor(pipeline, block_indices)

    try:
        # Only the hook side effects matter; the generated output is discarded.
        pipeline(
            prompt=prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            latents=latents,
            return_dict=True,
        )

        transformer_features = {
            key: feat.cpu() for key, feat in trans_hook.features.items()
        }

        os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, "cogvideox_features.pt")
        torch.save(transformer_features, out_path)
        print(f"Saved transformer features to: {out_path}")

    finally:
        trans_hook.remove_hooks()
        trans_hook.clear_features()
        # Drop the local references to the model and inputs first so that
        # clear_memory() (defined elsewhere in this file) can presumably
        # reclaim their device memory.
        del trans_hook, pipeline, latents
        clear_memory()

    return transformer_features

# Input/output locations for video 0.
save_dir = "features/0"
frame_dir = "data/0"

# Phase 2: one diffusion step through the pipeline, capturing transformer
# block outputs via forward hooks.
single_step_features = extract_and_save_transformer_features_single_step(
    prompt=(
        "A mountain goat with distinctive long gray and tan fur traverses rocky terrain along a steep mountainside. "
        "The goat demonstrates remarkable agility as it carefully steps across large pink-hued boulders, maintaining perfect balance despite the precarious elevation. "
        "Its thick coat sways gently with each deliberate movement, while the rugged landscape of scattered vegetation and exposed rock formations creates a dramatic backdrop. "
        "The goat's sure-footed navigation showcases its natural adaptation to mountainous environments, as it confidently moves across the rocky outcrop while surveying the surrounding slopes and valleys below."
    ),
    save_dir=save_dir,
    device=device,
    block_indices=(0, 7, 14, 21, 29),
    height=480,
    width=720,
    num_frames=32,
    num_inference_steps=1,
    guidance_scale=1.0,
)

print("\nFeature extraction phase 2 complete: Transformer features from single diffusion step")

0:
A mountain goat with distinctive long gray and tan fur traverses rocky terrain along a steep mountainside. The goat demonstrates remarkable agility as it carefully steps across large pink-hued boulders, maintaining perfect balance despite the precarious elevation. Its thick coat sways gently with each deliberate movement, while the rugged landscape of scattered vegetation and exposed rock formations creates a dramatic backdrop. The goat's sure-footed navigation showcases its natural adaptation to mountainous environments, as it confidently moves across the rocky outcrop while surveying the surrounding slopes and valleys below.


1:
A gray compact SUV moves steadily along an urban street, passing through a curved section of road. The vehicle maintains a consistent speed as it navigates past historic buildings with tan facades and large columns. The scene is set in what appears to be a European city, with parked cars lining the street and yellow flowers blooming in raised planters near a large stone building. A blue directional arrow sign is visible in the background, and the lighting suggests it's during daytime. The SUV's black wheels and metallic paint glisten as it progresses through the gentle curve of the street.


2:
A dirt bike rider performs a dramatic jump sequence on an off-road track surrounded by pine forest. Wearing bright protective gear with a neon helmet, the rider approaches the jump with speed, launches into the air, and executes a series of aerial maneuvers. The green motorcycle tilts upward as it gains height, revealing the underside of the bike and demonstrating the rider's control. The sequence captures multiple angles of the airborne bike, from initial takeoff through peak elevation, showcasing both technical skill and the dynamic relationship between rider and machine in motocross sport.


3:
A breakdancer performs a dynamic handstand sequence in front of an engaged crowd on an outdoor wooden platform, with ornate building facades as the backdrop. The dancer, wearing a red sweatshirt and light jeans, begins with a controlled handstand, then transitions through a series of fluid rotations and spins while balanced on their hands. The crowd, consisting of young people in casual streetwear, forms a semicircle around the performance space, watching intently as the dancer executes complex breaking movements. The sunlit scene captures the athletic precision and artistic expression of street dance culture.

4:
A drift car navigates a curved track at the Osaka Maishima Sports Island, demonstrating precise car control through controlled sliding. The red and white vehicle enters the corner while initiating a drift, generating thick white smoke from the tires as it maintains a sideways angle through the turn. The driver skillfully modulates the throttle and steering to maintain the optimal drift angle, with tire smoke billowing consistently throughout the maneuver. The car's trajectory follows the curve of the track marked by white lines, while spectators and event infrastructure, including display screens and temporary structures, are visible in the background.

5:
A red drift car performs a controlled slide across a race track, leaving dark tire marks on the concrete surface. The car maintains a precise sideways angle while navigating along a barrier lined with alternating red and white safety barriers. In the background, white canopy tents and parked vehicles are visible, along with a "KIDS" sign. The track appears to be part of a motorsport venue with spectator areas marked by green and yellow barriers. The motion is fluid and deliberate, demonstrating skilled car control as the driver executes the drift maneuver with consistent speed and angle throughout the run.

6:
Two martial artists in white uniforms engage in a judo throw technique on a green training mat while others observe from the edges. The executing practitioner initiates by gripping their partner's uniform, establishing a close standing position. Through a fluid combination of pulling and rotating movements, they lift and pivot their partner off balance, causing them to become airborne in a forward flip motion. The throw culminates with the partner landing on their back on the mat while the executing practitioner maintains control through the grip, following through to a final ground position to complete the throwing technique.

7:
Two people ride a custom-built cart with a bright blue base and a large cylindrical tank mounted on top as they approach and traverse a wooden ramp installed on a residential street. The driver wears a helmet while a passenger stands behind them. The ramp has yellow-painted panels supported by green and orange metal struts. As the cart moves forward, it maintains steady momentum to successfully jump the ramp while spectators watch from the sidewalk, including a person in red and others observing the stunt. The cart appears to achieve a smooth takeoff and landing during its trajectory over the ramp.

8:
A person wearing a yellow jacket and khaki pants stands in a grassy field under cloudy skies, accompanied by two dogs - a larger black and white dog and a smaller tricolor dog. The person trains the dogs using treats, with both canines attentively watching and following commands. The smaller dog performs tricks including standing on its hind legs and jumping up, while the larger dog observes. Throughout their training session, the person maintains a consistent stance while dispensing treats and giving commands, demonstrating a well-practiced routine with their responsive and eager canine companions against the backdrop of a serene rural landscape.

9:
A person wearing a gray t-shirt and dark pants performs a parkour vault movement over a concrete wall. They approach the wall with momentum, placing their hands on the top surface while lifting their body upward in a fluid motion. As they elevate, their legs tuck close to their body before extending outward in a controlled manner. The movement transitions into a precise landing position where they maintain balance while absorbing the impact. The urban setting includes metal railings, stepped pathways, and apartment buildings in the background, all under a clear blue sky.

10:
People walk along a sunlit street in what appears to be India, with several women wearing traditional attire including salwar kameez and sarees in vibrant colors like yellow, pink, and red. A woman in a brown and pink outfit maintains a steady pace in the center, carrying bags as she moves forward. Behind her, motorcycles are parked while pedestrians mill about. The scene has a warm, golden atmosphere created by late afternoon sunlight filtering through trees lining the street. Signs are visible on poles, and parked cars line one side of the road as people continue their daily activities.

11:
Three pigs forage together on dirt ground - a large black and white adult sow flanked by two smaller piglets, one brown and one white with brown spots. The brown piglet stays close to the middle while the spotted one explores on the left. All three pigs move methodically forward, keeping their snouts low to the ground as they search for food. The adult sow leads the way with purposeful steps, her distinctive black body contrasting with white patches around her face and legs. Their movements are synchronized and deliberate as they investigate the terrain together.

12:
A brown and white spotted cow stands in a grassy field with patches of bare muddy ground. The cow wears a dark collar with what appears to be a bell attached. The animal moves deliberately forward along a path marked by wooden posts and a thin wire or rope. The lighting remains consistent throughout, highlighting the cow's distinctive coat pattern and casting shadows beneath it. The ground shows signs of wear and use, with a mix of grass on one side and exposed dirt on the path where the cow walks. In the lower corner, a partial view of what seems to be a container or equipment with orange markings is visible.

13:
Five goldfish swim gracefully in a blue-lit aquarium environment, with one particularly distinctive fish having more prominent eyes and rounder features than its companions. The goldfish exhibit smooth, fluid movements as they navigate around aquatic plants and decorative elements, their orange-gold scales shimmering against the deep blue background. Each fish maintains its own swimming pattern - some staying higher in the tank, others swimming in the middle space, creating natural layers of movement. Their fins and tails flow elegantly as they glide through the water, demonstrating the characteristic fluid motion of healthy, active goldfish in their aquatic habitat.

14:
A paraglider prepares for takeoff from a grassy mountainside overlooking a sprawling town and forested slopes below. Wearing gray protective gear, a helmet, and carrying a large backpack, they progressively build momentum by running forward while handling the control lines. As they gain speed, their movements become more dynamic, transitioning from a steady jog to increasingly powerful strides. The running motion intensifies until they achieve sufficient velocity, at which point they leap forward, their feet leaving the ground as the paragliding canopy lifts them into the air, beginning their descent toward the valley below.

15:
A dromedary camel walks steadily along a sandy enclosure bordered by wooden fencing and green vegetation. The camel's movements reveal a distinctive gait pattern as it strides forward, lifting and placing each leg in a deliberate rhythm. Its beige-colored body sways gently with each step, while the prominent single hump maintains balance during locomotion. The camel's long neck extends forward as it moves, and its tail occasionally twitches. The lighting casts clear shadows beneath the animal as it traverses the enclosure, highlighting the natural, fluid motion of its muscular legs and characteristic pacing style.

16:
A black swan glides gracefully through dark green water, creating gentle ripples as it moves. The swan's distinctive features are prominently displayed - its elongated neck held in an elegant S-curve, bright red beak contrasting with its deep ebony plumage, and feathers arranged in a textured pattern across its body. The background consists of leafy green foliage hanging over a concrete embankment. The swan maintains a steady, smooth swimming motion while keeping its head position relatively stable, demonstrating the characteristic poise and grace these waterfowl are known for. Reflections dance on the water's surface as the bird propels itself forward.

17:
A cream-colored dog with a fluffy coat explores an outdoor area with dry terrain and scattered vegetation. The dog moves deliberately across the ground, keeping its nose low while walking steadily forward. Its tail is held upright as it appears to be tracking or investigating something of interest. The dog maintains a focused, alert posture throughout its movement, suggesting it may be following a scent trail. The surroundings include a mesh fence in the background and some green foliage along the edges of the space. A red harness or collar is visible on the dog's body as it continues its methodical investigation of the area.

18:
A cyclist in athletic wear and a cap performs maintenance on a road bike indoors. Standing barefoot, they begin by examining the pedals and drivetrain, leaning in close to inspect the components. Their movements are methodical as they rotate the cranks and check the chain tension. Next, they turn their attention to the front wheel, carefully lifting and spinning it to assess alignment. The cyclist continues their inspection by examining both wheels thoroughly, demonstrating careful attention to detail throughout the maintenance routine. The scene takes place in a room with wooden flooring, brick walls, and white stairs in the background.

19:
A person wearing tactical gear, including a dark jacket with red accents, slides down a thick rope while holding it with one hand and gripping what appears to be a tactical weapon in the other. They display a consistently bright, enthusiastic grin throughout their descent. As they move downward, bullet shells scatter around them, catching the warm lighting that illuminates the scene. The background features vertical metallic structures and industrial-looking elements. Their motion is dynamic and controlled, maintaining balance while smoothly descending, with their equipment and clothing showing subtle movement from the action.

20:
Three lab-coated figures stand in stylized poses on an artificial turf field, with a chain-link fence and trees visible behind them. All wear safety goggles and white attire, differentiated by their boots - one in white sneakers, one in dark brown boots, and one in tan boots. The leftmost person checks their phone while slightly swaying, creating subtle movements. The middle and rightmost figures maintain steady stances but exhibit small postural adjustments, their long hair gently moving in the breeze. Their positioning suggests a coordinated photoshoot, with each person's micro-movements contributing to a dynamic yet controlled composition against the outdoor setting.

21:
A kitesurfer glides dynamically across turquoise waters with a mountainous coastline in the background. Harnessing the wind's power through control lines, they maintain a steady sideways stance while generating impressive spray patterns. As they carve through the water, their board creates a widening wake of white foam. The rider's body remains balanced and tensed, arms extended to steer the kite, while repeatedly lifting off the water's surface in controlled jumps. Their fluid movements demonstrate skilled maneuvering as they navigate the conditions, with each jump producing increasingly dramatic splashes and water patterns beneath them.

22:
A young cyclist rides a red bicycle along a straight path parallel to a colorful graffiti wall. Wearing a white t-shirt and patterned shorts, they maintain a steady pedaling rhythm as they cruise past the vibrant street art featuring bold orange, yellow, and green designs with stylized characters. The scene is framed by trees, with tall grass growing on the slope beside the wall. The motion blur in the images creates a sense of forward movement, while the background artwork provides a striking urban backdrop for the casual bike ride.

23:
A dancer performs a contemporary solo on an outdoor stage with a curved wall of ornamental grasses and hay bales as the backdrop. She wears a navy blue dress with flowing gray fabric attachments and dances barefoot before a seated audience. Her movements flow from standing poses into dynamic spins, with her arms gracefully extending outward and upward. She shifts her weight between feet while executing controlled turns, letting the dress fabric create sweeping patterns through the air. Her choreography includes subtle head tilts, fluid arm circles, and balanced poses that transition smoothly into swift directional changes, all while maintaining an elegant performance presence.

24:
A silver compact car navigates through an urban intersection bordered by light-colored buildings and white bollards. Starting from the right side, the vehicle smoothly curves leftward while maintaining a steady pace. As it progresses through its turn, the car's body tilts slightly due to the cornering forces, while its wheels rotate steadily against the dark asphalt. The vehicle continues its fluid arc across the frame, gradually straightening its trajectory as it approaches a pedestrian crossing. Throughout its motion, the car maintains consistent speed and exhibits stable handling characteristics while executing the left turn maneuver, eventually beginning to level out as it prepares to continue straight ahead.

25:
A small dog with distinctive black and tan coloring moves purposefully through a garden setting, passing between green metal fence posts. The dog maintains a steady trotting gait, demonstrating agile movement as it navigates around trees and past purple flowering plants. Its ears are perked upward in an alert position, and its tail is carried high while moving. The dog's muscular legs show a coordinated walking pattern, with its body remaining level and balanced throughout the motion. The background features lush greenery, including mature trees and ornamental shrubs in a fenced yard space.

26:
A motorcyclist in a navy suit and white helmet rides past a line of parked motorcycles on a sunny urban street with classical architecture featuring arches and balconies. The rider maintains a steady forward motion while seated upright, smoothly progressing from right to left across the view. Their movement appears graceful and controlled as they navigate the wide road, passing both stationary motorcycles on one side and a silver station wagon on the other. Green trees frame the top of the scene, casting dappled shadows on the street as the rider continues their journey through the frame with unwavering momentum.

27:
A motorcyclist wearing a white helmet and shirt performs a continuous burnout maneuver on a paved area near a tunnel entrance. The rider maintains steady control as the rear tire spins rapidly against the ground, generating thick white smoke that billows outward. Dark tire marks spiral and interweave across the pavement, creating intricate circular patterns as the motorcycle pivots and rotates. The smoke intensifies and spreads wider while the rider skillfully balances the motorcycle, keeping the front wheel relatively stable as the rear continues its spinning motion. The surrounding environment includes a grassy slope and concrete barriers, remaining static throughout the action.

28:
In a retail store electronics section, a store employee wearing a gray vest and blue shirt interacts with a customer in a black jacket. The employee gestures and shifts his weight while communicating, his body language suggesting an explanatory conversation. The customer, positioned to the right, carefully examines and handles display items, lifting and inspecting boxes. Their interaction flows naturally as the customer reaches for products while the employee maintains an open, engaged stance, occasionally adjusting his position and using hand movements to emphasize points. The customer's movements are deliberate and focused, suggesting careful consideration of the merchandise while the employee maintains a helpful, attentive presence throughout their exchange.

29:
A rider in white attire and black helmet guides a chestnut horse through a show jumping course. The horse's approach begins with a controlled canter, gathering momentum as they near the decorated jump obstacle. As they reach the takeoff point, the horse powerfully propels upward, tucking its front legs while the rider maintains a forward position. At the jump's apex, both horse and rider demonstrate precise form before the horse extends its legs to clear the pole. During landing, the horse's neck stretches forward while its tail flows gracefully, absorbing the impact as they transition smoothly back into a canter to continue their course.
