⚡️ Speed up method `LTXPipeline._pack_latents` by 5% by codeflash-ai[bot] · Pull Request #129 · aseembits93/diffusers

codeflash-ai · 2025-06-01T10:05:49Z

📄 5% (0.05x) speedup for `LTXPipeline._pack_latents` in `src/diffusers/pipelines/ltx/pipeline_ltx.py`

⏱️ Runtime : 9.27 milliseconds → 8.83 milliseconds (best of 170 runs)

📝 Explanation and details

Here is an optimized version of your program for better speed and memory. Main changes.

Avoid getattr with fallback: Using getattr inside init with a default fallback is not needed if you already have the argument. Use arg directly. This reduces Python attribute lookups and ensures early errors.
Remove unnecessary 'if getattr' defaults: If you always call the constructor with real objects, you don't need these checks.
Optimize _pack_latents: Refactor to use a single reshape and permute, with merged flatten, which is more explicit and minimizes intermediate objects. Compute shape ahead, avoid unnecessary -1 reshape argument (faster in PyTorch).
In init, precompute scalar attributes to local variables before repeated access.

You can try further JIT/CUDA-level improvements, but this is close to optimal for a pipeline utility function in pure Python and PyTorch.

Key Speedups.

Fewer PyTorch metadata computations.
No unnecessary lookups on self.X when argument is available.
Minimized reshaping, and clearer patching for compiler efficiency.

The pipeline logic and API remain unchanged.

✅ Correctness verification report:

Test	Status
⚙️ Existing Unit Tests	🔘 None Found
🌀 Generated Regression Tests	✅ 44 Passed
⏪ Replay Tests	🔘 None Found
🔎 Concolic Coverage Tests	🔘 None Found
📊 Tests Coverage	100.0%

🌀 Generated Regression Tests Details

import pytest  # used for our unit tests
import torch
from src.diffusers.pipelines.ltx.pipeline_ltx import LTXPipeline

# unit tests

# -------- BASIC TEST CASES --------

def test_pack_latents_identity_patchsize_1():
    # Basic test: patch_size and patch_size_t are 1, so output should be [B, F*H*W, C]
    latents = torch.arange(2*3*4*5*6).reshape(2,3,4,5,6).float()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=1, patch_size_t=1); out = codeflash_output
    # Check that the first token in the sequence corresponds to the first (frame, h, w) with all channels
    # [B, C, F, H, W] -> [B, F*H*W, C]
    for b in range(2):
        for c in range(3):
            pass

def test_pack_latents_simple_patchsize_2():
    # Simple test: patch_size=2, patch_size_t=2
    latents = torch.arange(1*1*4*4*4).reshape(1,1,4,4,4).float()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output
    # Each "token" in the output should correspond to a 2x2x2 patch
    # Let's check the first token: should include all values from frames 0-1, h 0-1, w 0-1
    patch = latents[0,0,0:2,0:2,0:2].reshape(-1)

def test_pack_latents_multiple_channels():
    # Test with more channels
    latents = torch.arange(2*4*2*2*2).reshape(2,4,2,2,2).float()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_pack_latents_non_square():
    # Test with non-square spatial and temporal dims
    latents = torch.arange(1*1*6*4*2).reshape(1,1,6,4,2).float()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=3); out = codeflash_output

def test_pack_latents_dtype_preservation():
    # Output dtype should match input dtype
    latents = torch.randn(1, 2, 4, 4, 4, dtype=torch.float64)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

# -------- EDGE TEST CASES --------

def test_pack_latents_patchsize_equals_dim():
    # patch_size == spatial/temporal dimension
    latents = torch.arange(1*1*2*2*2).reshape(1,1,2,2,2).float()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_pack_latents_patchsize_larger_than_dim_raises():
    # patch_size or patch_size_t > dimension should raise
    latents = torch.zeros(1,1,4,4,4)
    with pytest.raises(RuntimeError):
        LTXPipeline._pack_latents(latents, patch_size=5, patch_size_t=1)
    with pytest.raises(RuntimeError):
        LTXPipeline._pack_latents(latents, patch_size=1, patch_size_t=5)

def test_pack_latents_patchsize_not_divisible_raises():
    # patch_size or patch_size_t not dividing dims should raise
    latents = torch.zeros(1,1,5,6,7)
    with pytest.raises(RuntimeError):
        LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=1)
    with pytest.raises(RuntimeError):
        LTXPipeline._pack_latents(latents, patch_size=1, patch_size_t=2)

def test_pack_latents_singleton_dims():
    # Singleton batch/channel/frame/height/width
    latents = torch.arange(1*1*1*2*2).reshape(1,1,1,2,2).float()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=1); out = codeflash_output

def test_pack_latents_minimal_patch():
    # Minimal patch: everything is 1
    latents = torch.tensor([[[[[42.]]]]])  # shape (1,1,1,1,1)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=1, patch_size_t=1); out = codeflash_output


def test_pack_latents_zero_patchsize_raises():
    latents = torch.zeros(1,1,4,4,4)
    with pytest.raises(ZeroDivisionError):
        LTXPipeline._pack_latents(latents, patch_size=0, patch_size_t=1)
    with pytest.raises(ZeroDivisionError):
        LTXPipeline._pack_latents(latents, patch_size=1, patch_size_t=0)

# -------- LARGE SCALE TEST CASES --------

def test_pack_latents_large_tensor():
    # Large tensor, but <100MB (e.g. 2*3*10*20*20 = 24,000 floats = ~96KB)
    latents = torch.randn(2, 3, 10, 20, 20)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_pack_latents_large_batch():
    # Large batch dimension
    latents = torch.randn(50, 2, 4, 8, 8)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_pack_latents_large_channels():
    # Large channel dimension
    latents = torch.randn(1, 64, 4, 4, 4)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_pack_latents_large_spatial():
    # Large spatial dimensions, but still <100MB
    latents = torch.randn(1, 1, 2, 64, 64)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=8, patch_size_t=2); out = codeflash_output

def test_pack_latents_large_temporal():
    # Large temporal dimension, but still <100MB
    latents = torch.randn(1, 1, 128, 2, 2)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=16); out = codeflash_output

# -------- FUNCTIONALITY/INTEGRITY TESTS --------

def test_pack_latents_mutation_would_fail():
    # If the function changes the permutation order, the output will not match the expected
    latents = torch.arange(1*1*2*2*2).reshape(1,1,2,2,2).float()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output
    # The correct output is a single token with all values in the original order
    expected = latents.flatten()

def test_pack_latents_permutation_integrity():
    # If the function skips the permute, the output will not match
    latents = torch.arange(1*1*2*2*2).reshape(1,1,2,2,2).float()
    # manually compute expected output using the function logic
    # For this shape and patch sizes, output should be latents.flatten()
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_pack_latents_gradient_preservation():
    # Ensure the output preserves requires_grad if input does
    latents = torch.randn(1,1,4,4,4, requires_grad=True)
    codeflash_output = LTXPipeline._pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

import pytest  # used for our unit tests
import torch
from src.diffusers.pipelines.ltx.pipeline_ltx import LTXPipeline

# function to test
# Copyright 2024 Lightricks and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

class Dummy:
    # Dummy class for staticmethod access
    @staticmethod
    def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor:
        # Unpacked latents of shape are [B, C, F, H, W] are patched into tokens of shape [B, C, F // p_t, p_t, H // p, p, W // p, p].
        # The patch dimensions are then permuted and collapsed into the channel dimension of shape:
        # [B, F // p_t * H // p * W // p, C * p_t * p * p] (an ndim=3 tensor).
        # dim=0 is the batch size, dim=1 is the effective video sequence length, dim=2 is the effective number of input features
        batch_size, num_channels, num_frames, height, width = latents.shape
        post_patch_num_frames = num_frames // patch_size_t
        post_patch_height = height // patch_size
        post_patch_width = width // patch_size
        latents = latents.reshape(
            batch_size,
            -1,
            post_patch_num_frames,
            patch_size_t,
            post_patch_height,
            patch_size,
            post_patch_width,
            patch_size,
        )
        latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3)
        return latents

_pack_latents = Dummy._pack_latents

# unit tests

# 1. BASIC TEST CASES

def test_identity_patch_size_1():
    # Patch size 1 should return a flattened version with correct shape
    latents = torch.arange(2*3*4*5*6).reshape(2,3,4,5,6)
    codeflash_output = _pack_latents(latents, patch_size=1, patch_size_t=1); out = codeflash_output
    # Check that the values match the expected
    # For patch_size=1, patch_size_t=1, the flattening should preserve the order for each (b, c)
    for b in range(2):
        for c in range(3):
            orig = latents[b, c].flatten()
            packed = out[b,:,c]

def test_patch_size_2_spatial():
    # Patch size 2 spatial, 1 temporal
    latents = torch.arange(1*1*4*4*4).reshape(1,1,4,4,4)
    codeflash_output = _pack_latents(latents, patch_size=2, patch_size_t=1); out = codeflash_output
    # Each patch should be a 2x2 block in spatial dims, check a few patches
    # For first frame, first patch (top-left), should be latents[0,0,0,0:2,0:2].flatten()
    expected_patch = latents[0,0,0,0:2,0:2].flatten()
    # Second patch in first frame: (top-right)
    expected_patch2 = latents[0,0,0,0:2,2:4].flatten()

def test_patch_size_2_temporal():
    # Patch size 2 temporal, 1 spatial
    latents = torch.arange(1*1*4*2*2).reshape(1,1,4,2,2)
    codeflash_output = _pack_latents(latents, patch_size=1, patch_size_t=2); out = codeflash_output
    # Each patch should be a 2-frame block in temporal dim, check first patch
    expected_patch = latents[0,0,0:2,0,0].flatten()
    # Next patch: frames 2:4, 0,0
    expected_patch2 = latents[0,0,2:4,0,0].flatten()

def test_patch_size_2_both():
    # Patch size 2 for both spatial and temporal
    latents = torch.arange(1*1*4*4*4).reshape(1,1,4,4,4)
    codeflash_output = _pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output
    # First patch: frames 0:2, H 0:2, W 0:2
    expected_patch = latents[0,0,0:2,0:2,0:2].flatten()

def test_multiple_channels_and_batch():
    # Test with batch >1 and channels >1
    latents = torch.arange(2*2*2*2*2).reshape(2,2,2,2,2)
    codeflash_output = _pack_latents(latents, patch_size=1, patch_size_t=1); out = codeflash_output
    # Check that each batch and channel is correct
    for b in range(2):
        for c in range(2):
            orig = latents[b, c].flatten()
            packed = out[b,:,c]

# 2. EDGE TEST CASES

def test_minimal_input():
    # Minimal valid input: all dims=1
    latents = torch.tensor([[[[[42]]]]], dtype=torch.float)
    codeflash_output = _pack_latents(latents, patch_size=1, patch_size_t=1); out = codeflash_output

def test_patch_size_equals_dim():
    # Patch size equals to dimension
    latents = torch.arange(1*1*4*4*4).reshape(1,1,4,4,4)
    codeflash_output = _pack_latents(latents, patch_size=4, patch_size_t=4); out = codeflash_output

def test_non_divisible_patch_size_spatial():
    # Patch size that does not divide H or W should raise
    latents = torch.zeros(1,1,4,5,6)
    with pytest.raises(RuntimeError):
        _pack_latents(latents, patch_size=3, patch_size_t=1)

def test_non_divisible_patch_size_temporal():
    # Patch size that does not divide F should raise
    latents = torch.zeros(1,1,5,4,4)
    with pytest.raises(RuntimeError):
        _pack_latents(latents, patch_size=1, patch_size_t=2)

def test_zero_patch_size():
    # Patch size 0 should raise
    latents = torch.zeros(1,1,4,4,4)
    with pytest.raises(ZeroDivisionError):
        _pack_latents(latents, patch_size=0, patch_size_t=1)
    with pytest.raises(ZeroDivisionError):
        _pack_latents(latents, patch_size=1, patch_size_t=0)

def test_negative_patch_size():
    # Negative patch size should raise
    latents = torch.zeros(1,1,4,4,4)
    with pytest.raises(RuntimeError):
        _pack_latents(latents, patch_size=-1, patch_size_t=1)
    with pytest.raises(RuntimeError):
        _pack_latents(latents, patch_size=1, patch_size_t=-2)

def test_large_patch_size():
    # Patch size larger than dimension should raise
    latents = torch.zeros(1,1,4,4,4)
    with pytest.raises(RuntimeError):
        _pack_latents(latents, patch_size=5, patch_size_t=1)
    with pytest.raises(RuntimeError):
        _pack_latents(latents, patch_size=1, patch_size_t=5)

def test_noncontiguous_input():
    # Non-contiguous input should still work
    latents = torch.arange(1*1*4*4*4).reshape(1,1,4,4,4)
    latents = latents.permute(0,1,2,4,3)  # make non-contiguous
    codeflash_output = _pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_float_and_int_types():
    # Should work for both float and int tensors
    latents_f = torch.ones(1,1,2,2,2, dtype=torch.float32)
    latents_i = torch.ones(1,1,2,2,2, dtype=torch.int64)
    codeflash_output = _pack_latents(latents_f, patch_size=1, patch_size_t=1); out_f = codeflash_output
    codeflash_output = _pack_latents(latents_i, patch_size=1, patch_size_t=1); out_i = codeflash_output

# 3. LARGE SCALE TEST CASES

def test_large_tensor_patch_size_1():
    # Large tensor, patch size 1
    B, C, F, H, W = 2, 3, 8, 8, 8  # 2*3*8*8*8 = 3072 elements, < 100MB
    latents = torch.arange(B*C*F*H*W).reshape(B,C,F,H,W)
    codeflash_output = _pack_latents(latents, patch_size=1, patch_size_t=1); out = codeflash_output

def test_large_tensor_patch_size_2():
    # Large tensor, patch size 2
    B, C, F, H, W = 1, 2, 8, 16, 16  # 1*2*8*16*16 = 4096*8 = 32768 elements, < 100MB
    latents = torch.arange(B*C*F*H*W).reshape(B,C,F,H,W)
    codeflash_output = _pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_large_batch_and_channels():
    # Large batch and channel count
    B, C, F, H, W = 10, 10, 4, 4, 4  # 10*10*4*4*4 = 6400 elements
    latents = torch.ones(B,C,F,H,W)
    codeflash_output = _pack_latents(latents, patch_size=2, patch_size_t=2); out = codeflash_output

def test_maximum_allowed_size():
    # Tensor close to 100MB: float32, 1e7 elements ~ 40MB, so 2e7 ~ 80MB
    # Let's use (2,2,16,32,32): 2*2*16*32*32 = 65536 elements, float32 = 262144 bytes = 0.25MB, so we can go bigger
    # Let's use (1,1,32,128,128): 1*1*32*128*128 = 524288 elements, 2MB
    # Let's use (1,1,64,256,64): 1*1*64*256*64 = 1,048,576 elements, 4MB
    # Let's try (1,1,32,512,512): 1*1*32*512*512 = 8,388,608 elements, 32MB
    latents = torch.ones(1,1,32,512,512)
    codeflash_output = _pack_latents(latents, patch_size=8, patch_size_t=4); out = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To edit these changes git checkout codeflash/optimize-LTXPipeline._pack_latents-mbdhwgg0 and push.

Here is an optimized version of your program for better speed and memory. Main changes. - **Avoid getattr with fallback**: Using getattr inside __init__ with a default fallback is not needed if you already have the argument. Use arg directly. This reduces Python attribute lookups and ensures early errors. - **Remove unnecessary 'if getattr' defaults**: If you always call the constructor with real objects, you don't need these checks. - **Optimize _pack_latents**: Refactor to use a single reshape and permute, with merged flatten, which is more explicit and minimizes intermediate objects. Compute shape ahead, avoid unnecessary -1 reshape argument (faster in PyTorch). - **In __init__**, precompute scalar attributes to local variables before repeated access. You can try further JIT/CUDA-level improvements, but this is close to optimal for a pipeline utility function in pure Python and PyTorch. **Key Speedups**. - Fewer PyTorch metadata computations. - No unnecessary lookups on self.X when argument is available. - Minimized reshaping, and clearer patching for compiler efficiency. **The pipeline logic and API remain unchanged.**

codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label Jun 1, 2025

codeflash-ai bot requested a review from aseembits93 June 1, 2025 10:05

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

⚡️ Speed up method `LTXPipeline._pack_latents` by 5%#129

⚡️ Speed up method `LTXPipeline._pack_latents` by 5%#129
codeflash-ai[bot] wants to merge 1 commit intomainfrom
codeflash/optimize-LTXPipeline._pack_latents-mbdhwgg0

codeflash-ai bot commented Jun 1, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

0 participants

Conversation

codeflash-ai bot commented Jun 1, 2025

📄 5% (0.05x) speedup for LTXPipeline._pack_latents in src/diffusers/pipelines/ltx/pipeline_ltx.py

📝 Explanation and details

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

0 participants

📄 5% (0.05x) speedup for `LTXPipeline._pack_latents` in `src/diffusers/pipelines/ltx/pipeline_ltx.py`