Skip to content

Transaction._build_partition_predicate() builds a left-deep tree and blows up on large overwrite commits #3267

@Kuinox

Description

@Kuinox

Apache Iceberg version

0.11.0.dev20260416005052

Please describe the bug 🐞

Hello,
I am running one of the CI builds of pyiceberg (0.11.0.dev20260416005052).
I run pyiceberg on a big dataset, and I need the recent optimisation on overwrite: #3011
Without it, pyiceberg scans all the manifests.
Sadly, that change introduced a regression: my batch now fails because the recursion in pyiceberg gets too deep.
Here is a minimal repro (it also needs `import pyarrow as pa`, which is missing from the imports below):

from pathlib import Path
from tempfile import TemporaryDirectory
from pyiceberg.catalog import load_catalog
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import NestedField


def make_row(partition_value: str, value: int) -> pa.Table:
    """Return a single-row Arrow table for the repro.

    The table has a ``date`` column (large_string) holding *partition_value*
    and a ``value`` column (int64) holding *value*.
    """
    date_column = pa.array([partition_value], type=pa.large_string())
    value_column = pa.array([value], type=pa.int64())
    return pa.table({"date": date_column, "value": value_column})


# Repro driver: create a throwaway identity-partitioned table, append 512 rows
# with distinct "date" values, then delete every planned data file through a
# single overwrite snapshot. The issue reports that this ends in a
# RecursionError raised from pyiceberg/expressions/visitors.py.
with TemporaryDirectory() as tmpdir:
    # Warehouse directory inside the temporary directory, passed as a file:// URI.
    warehouse = Path(tmpdir, "warehouse")
    warehouse.mkdir(parents=True, exist_ok=True)
    warehouse_uri = f"file://{warehouse.resolve().as_posix()}"

    # Catalog of type "in-memory" pointed at the temporary warehouse.
    catalog = load_catalog(
        "local",
        type="in-memory",
        warehouse=warehouse_uri,
    )
    catalog.create_namespace("default")

    # Two optional columns: "date" (string, id 1) and "value" (long, id 2).
    schema = Schema(
        NestedField(field_id=1, name="date", field_type="string", required=False),
        NestedField(field_id=2, name="value", field_type="long", required=False),
    )
    # Identity partitioning on source field 1, i.e. the "date" column.
    partition_spec = PartitionSpec(
        PartitionField(
            source_id=1,
            field_id=1000,
            transform=IdentityTransform(),
            name="date",
        )
    )

    table = catalog.create_table(
        "default.repro",
        schema=schema,
        partition_spec=partition_spec,
    )

    # Seed 512 distinct partition files: one append per iteration, each with a
    # different "date" value (the zero-padded loop index is embedded in it).
    for i in range(512):
        table.append(make_row(f"2026-02-{i:04d}T00", 1))

    # Collect the data file of every task planned by an unfiltered scan.
    files_to_delete = [task.file for task in table.scan().plan_files()]
    print(f"files_to_delete={len(files_to_delete)}")

    # Repro: open a transaction and do an overwrite, like our code path.
    # All collected files are registered for deletion in one overwrite snapshot.
    # NOTE(review): presumably the failure surfaces when the context managers
    # exit and the snapshot is committed — confirm against the full traceback.
    with table.transaction() as tx:
        with tx.update_snapshot().overwrite() as overwrite_snapshot:
            for data_file in files_to_delete:
                overwrite_snapshot.delete_data_file(data_file)

This fails with the following error:

[... truncated ...]
  File "C:\Dev\python\.venv\Lib\site-packages\pyiceberg\expressions\visitors.py", line 193, in _
    left_result: T = visit(obj.left, visitor=visitor)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python311\Lib\functools.py", line 909, in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Dev\python\.venv\Lib\site-packages\pyiceberg\expressions\visitors.py", line 193, in _
    left_result: T = visit(obj.left, visitor=visitor)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python311\Lib\functools.py", line 909, in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Dev\python\.venv\Lib\site-packages\pyiceberg\expressions\visitors.py", line 193, in _
    left_result: T = visit(obj.left, visitor=visitor)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python311\Lib\functools.py", line 909, in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Dev\python\.venv\Lib\site-packages\pyiceberg\expressions\visitors.py", line 193, in _
    left_result: T = visit(obj.left, visitor=visitor)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python311\Lib\functools.py", line 909, in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Dev\python\.venv\Lib\site-packages\pyiceberg\expressions\visitors.py", line 193, in _
    left_result: T = visit(obj.left, visitor=visitor)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python311\Lib\functools.py", line 909, in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python311\Lib\functools.py", line 832, in dispatch
    impl = dispatch_cache[cls]
           ~~~~~~~~~~~~~~^^^^^
  File "C:\Program Files\Python311\Lib\weakref.py", line 415, in __getitem__
    return self.data[ref(key)]
           ~~~~~~~~~^^^^^^^^^^
RecursionError: maximum recursion depth exceeded in comparison

Willingness to contribute

  • I can contribute a fix for this bug independently
  • I would be willing to contribute a fix for this bug with guidance from the Iceberg community
  • I cannot contribute a fix for this bug at this time

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions