From d7699929b129801a15f7ed35707a353dae7877a7 Mon Sep 17 00:00:00 2001 From: Sotaro Hikita Date: Tue, 14 Apr 2026 08:41:40 +0900 Subject: [PATCH] Fix DELETED manifest entry snapshot_id in _OverwriteFiles When _OverwriteFiles._deleted_entries() creates DELETED manifest entries, it now sets snapshot_id to the current (deleting) snapshot's ID instead of retaining the original INSERT snapshot's ID. This aligns with the Iceberg spec which states that for DELETED entries, snapshot_id should be the snapshot in which the file was deleted. The existing _DeleteFiles._compute_deletes() already handled this correctly. Signed-off-by: Sotaro Hikita --- pyiceberg/table/update/snapshot.py | 2 +- tests/integration/test_deletes.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index 37d120969a..1df4e64c24 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -652,7 +652,7 @@ def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]: return [ ManifestEntry.from_args( status=ManifestEntryStatus.DELETED, - snapshot_id=entry.snapshot_id, + snapshot_id=self._snapshot_id, sequence_number=entry.sequence_number, file_sequence_number=entry.file_sequence_number, data_file=entry.data_file, diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py index e3b487e465..711333529e 100644 --- a/tests/integration/test_deletes.py +++ b/tests/integration/test_deletes.py @@ -975,3 +975,58 @@ def assert_manifest_entry(expected_status: ManifestEntryStatus, expected_snapsho assert after_delete_snapshot is not None assert_manifest_entry(ManifestEntryStatus.DELETED, after_delete_snapshot.snapshot_id) + + +@pytest.mark.integration +def test_manifest_entry_snapshot_id_after_partial_deletes(session_catalog: RestCatalog) -> None: + """Test that DELETED manifest entries from a CoW overwrite (partial delete) have the correct snapshot_id. + + When only some rows match the delete filter, PyIceberg rewrites the file via _OverwriteFiles. + The DELETED entry's snapshot_id must be the deleting snapshot's ID, not the original INSERT snapshot's ID. + See: https://github.com/apache/iceberg-python/issues/3236 + """ + identifier = "default.test_manifest_entry_snapshot_id_after_partial_deletes" + try: + session_catalog.drop_table(identifier) + except NoSuchTableError: + pass + + schema = pa.schema( + [ + ("id", pa.int32()), + ("name", pa.string()), + ] + ) + + table = session_catalog.create_table(identifier, schema) + data = pa.Table.from_pylist( + [ + {"id": 1, "name": "keep"}, + {"id": 2, "name": "keep"}, + {"id": 3, "name": "delete_me"}, + {"id": 4, "name": "delete_me"}, + ], + schema=schema, + ) + table.append(data) + + # Partial delete: only some rows match, triggering CoW overwrite via _OverwriteFiles + table.delete(EqualTo("name", "delete_me")) + after_delete_snapshot = table.refresh().current_snapshot() + assert after_delete_snapshot is not None + + manifests = after_delete_snapshot.manifests(table.io) + deleted_entries = [ + entry + for manifest in manifests + for entry in manifest.fetch_manifest_entry(table.io, discard_deleted=False) + if entry.status == ManifestEntryStatus.DELETED + ] + + assert len(deleted_entries) > 0, "Expected at least one DELETED manifest entry from the CoW overwrite" + + for entry in deleted_entries: + assert entry.snapshot_id == after_delete_snapshot.snapshot_id, ( + f"DELETED entry snapshot_id should be {after_delete_snapshot.snapshot_id} " + f"(the deleting snapshot), but was {entry.snapshot_id}" + )