From 3085e6132b8dc32fff446077039bed70eb151ac8 Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Fri, 22 May 2026 14:21:27 +0800 Subject: [PATCH 1/4] feat(update): add MergingSnapshotUpdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract base for merge-based snapshot operations (MergeAppend, OverwriteFiles, RowDelta, etc.), implementing the filter → write → merge pipeline consistent with Java's MergingSnapshotProducer. Also fixes SnapshotSummaryBuilder manifest count fields and a use-after-free bug in SnapshotUpdate::DeleteFile. --- .github/workflows/pre-commit.yml | 2 +- src/iceberg/CMakeLists.txt | 1 + .../manifest/manifest_filter_manager.cc | 37 +- .../manifest/manifest_filter_manager.h | 34 + .../manifest/manifest_merge_manager.cc | 4 + src/iceberg/manifest/manifest_merge_manager.h | 5 + src/iceberg/meson.build | 1 + src/iceberg/snapshot.cc | 24 + src/iceberg/snapshot.h | 13 + src/iceberg/test/CMakeLists.txt | 1 + .../test/manifest_filter_manager_test.cc | 183 ++++ .../test/merging_snapshot_update_test.cc | 736 +++++++++++++++ src/iceberg/update/merging_snapshot_update.cc | 870 ++++++++++++++++++ src/iceberg/update/merging_snapshot_update.h | 350 +++++++ src/iceberg/update/snapshot_update.cc | 8 +- .../update/update_partition_statistics.h | 5 +- 16 files changed, 2263 insertions(+), 11 deletions(-) create mode 100644 src/iceberg/test/merging_snapshot_update_test.cc create mode 100644 src/iceberg/update/merging_snapshot_update.cc create mode 100644 src/iceberg/update/merging_snapshot_update.h diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 583f280bd..479ed8b45 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -25,7 +25,7 @@ on: - '!dependabot/**' permissions: - contents: read + contents: write jobs: pre-commit: diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 68cacebeb..145cafe50 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -88,6 +88,7 @@ set(ICEBERG_SOURCES type.cc update/expire_snapshots.cc update/fast_append.cc + update/merging_snapshot_update.cc update/pending_update.cc update/set_snapshot.cc update/snapshot_manager.cc diff --git a/src/iceberg/manifest/manifest_filter_manager.cc b/src/iceberg/manifest/manifest_filter_manager.cc index 086c94a78..2b53158bc 100644 --- a/src/iceberg/manifest/manifest_filter_manager.cc +++ b/src/iceberg/manifest/manifest_filter_manager.cc @@ -117,12 +117,24 @@ bool ManifestFilterManager::ContainsDeletes() const { !drop_partitions_.empty(); } +void ManifestFilterManager::DropDeleteFilesOlderThan(int64_t sequence_number) { + min_sequence_number_ = sequence_number; +} + +void ManifestFilterManager::RemoveDanglingDeletesFor(const DataFileSet& deleted_files) { + for (const auto& file : deleted_files) { + removed_data_file_paths_.insert(file->file_path); + } +} + Result ManifestFilterManager::CanContainDroppedFiles(const ManifestFile&) const { // TODO(Guotao): Use the manifest descriptor to skip unrelated object-delete // manifests once object-delete partitions are tracked separately. // Currently, DeleteFile(std::shared_ptr) degrades to a path-based delete, // which forces scanning all manifests. - return !delete_paths_.empty(); + // Also open delete manifests when a minimum sequence number is set for cleanup. + return !delete_paths_.empty() || !removed_data_file_paths_.empty() || + (manifest_content_ == ManifestContent::kDeletes && min_sequence_number_ > 0); } Result ManifestFilterManager::CanContainDroppedPartitions( @@ -219,6 +231,25 @@ Result ManifestFilterManager::ShouldDelete(const ManifestEntry& entry, return true; } + // Delete-manifest-specific cleanup (only for ManifestContent::kDeletes). + if (manifest_content_ == ManifestContent::kDeletes) { + // Drop delete files whose data sequence number is older than the minimum + // retained by the table (they can no longer match any live data rows). + // seq == 0 (kInitialSequenceNumber / nullopt) is intentionally excluded: + // those entries predate sequence number assignment and must not be pruned. + int64_t seq = entry.sequence_number.value_or(0); + if (min_sequence_number_ > 0 && seq > 0 && seq < min_sequence_number_) { + return true; + } + + // Drop DVs that reference a data file that has been removed (dangling DV). + if (!removed_data_file_paths_.empty() && file.IsDeletionVector() && + file.referenced_data_file.has_value() && + removed_data_file_paths_.count(*file.referenced_data_file)) { + return true; + } + } + if (HasRowFilterExpression(delete_expr_)) { ICEBERG_ASSIGN_OR_RAISE(auto* residual_eval, GetResidualEvaluator(schema, specs_by_id, spec_id)); @@ -403,6 +434,7 @@ Result> ManifestFilterManager::FilterManifests( bool trust_manifest_references = CanTrustManifestReferences(manifests); manifest_evaluator_cache_.clear(); residual_evaluator_cache_.clear(); + replaced_manifests_count_ = 0; // TODO(Guotao): Parallelize manifest filtering with per-manifest results, then // merge found paths and deleted files after the loop. @@ -413,6 +445,9 @@ Result> ManifestFilterManager::FilterManifests( auto filtered_manifest, FilterManifest(schema, specs_by_id, *manifest_ptr, trust_manifest_references, writer_factory, found_paths)); + if (filtered_manifest.manifest_path != manifest_ptr->manifest_path) { + ++replaced_manifests_count_; + } filtered.push_back(std::move(filtered_manifest)); } diff --git a/src/iceberg/manifest/manifest_filter_manager.h b/src/iceberg/manifest/manifest_filter_manager.h index 55258b2b1..981b9ac3b 100644 --- a/src/iceberg/manifest/manifest_filter_manager.h +++ b/src/iceberg/manifest/manifest_filter_manager.h @@ -116,9 +116,36 @@ class ICEBERG_EXPORT ManifestFilterManager { /// manifest entry matches a delete condition. void FailAnyDelete(); + /// \brief Returns the number of manifests rewritten (replaced) by the last + /// FilterManifests() call. A manifest is replaced when it contained deleted entries + /// and was rewritten with those entries marked DELETED. + int32_t ReplacedManifestsCount() const { return replaced_manifests_count_; } + /// \brief Returns true if any delete condition has been registered. bool ContainsDeletes() const; + /// \brief Set the minimum data sequence number for delete files to retain. + /// + /// Only valid for ManifestContent::kDeletes managers. Delete entries whose + /// data_sequence_number is positive and less than \p sequence_number will be + /// marked DELETED. This continuously removes delete files that cannot match + /// any remaining data rows (i.e. all data written before that sequence number + /// has itself been deleted). + /// + /// \param sequence_number the inclusive lower bound; delete files older than + /// this value are dropped + void DropDeleteFilesOlderThan(int64_t sequence_number); + + /// \brief Register data files that have been removed so their dangling DVs + /// can be cleaned up. + /// + /// Only valid for ManifestContent::kDeletes managers. For each DV whose + /// referenced_data_file path appears in \p deleted_files, the DV entry is + /// marked DELETED because the data file it targets no longer exists. + /// + /// \param deleted_files set of data files that have been marked for deletion + void RemoveDanglingDeletesFor(const DataFileSet& deleted_files); + /// \brief Apply all accumulated delete conditions to the base snapshot's manifests. /// /// Manifests that cannot possibly contain deleted files are returned unchanged. @@ -220,6 +247,13 @@ class ICEBERG_EXPORT ManifestFilterManager { bool fail_any_delete_{false}; bool case_sensitive_{true}; + int32_t replaced_manifests_count_{0}; + + // minimum data sequence number; delete entries older than this are dropped + int64_t min_sequence_number_{0}; + // paths of data files that were removed; DVs referencing these are dangling + std::unordered_set removed_data_file_paths_; + std::unordered_map> manifest_evaluator_cache_; std::unordered_map> diff --git a/src/iceberg/manifest/manifest_merge_manager.cc b/src/iceberg/manifest/manifest_merge_manager.cc index 056dce3f5..aedcea735 100644 --- a/src/iceberg/manifest/manifest_merge_manager.cc +++ b/src/iceberg/manifest/manifest_merge_manager.cc @@ -57,6 +57,7 @@ Result> ManifestMergeManager::MergeManifests( std::ranges::copy(manifest_ranges | std::views::join, std::back_inserter(all)); if (all.empty() || !merge_enabled_) { + replaced_manifests_count_ = 0; return all | std::views::transform([](const ManifestFile* manifest) { return *manifest; }) | std::ranges::to>(); @@ -82,6 +83,7 @@ Result> ManifestMergeManager::MergeManifests( std::vector result; result.reserve(all.size()); + replaced_manifests_count_ = 0; for (auto& [key, group] : by_spec) { const auto* first = first_by_content.at(key.second); ICEBERG_ASSIGN_OR_RAISE(auto merged, MergeGroup(group, first, snapshot_id, metadata, @@ -140,6 +142,8 @@ Result> ManifestMergeManager::MergeGroup( } else { ICEBERG_ASSIGN_OR_RAISE( auto merged, FlushBin(bin, snapshot_id, metadata, file_io, writer_factory)); + // Each manifest consumed into the merged output (beyond the 1 output) is replaced. + replaced_manifests_count_ += static_cast(bin.size()) - 1; result.push_back(std::move(merged)); } } diff --git a/src/iceberg/manifest/manifest_merge_manager.h b/src/iceberg/manifest/manifest_merge_manager.h index 16cc8d987..614ab61c6 100644 --- a/src/iceberg/manifest/manifest_merge_manager.h +++ b/src/iceberg/manifest/manifest_merge_manager.h @@ -84,6 +84,10 @@ class ICEBERG_EXPORT ManifestMergeManager { const TableMetadata& metadata, std::shared_ptr file_io, const ManifestWriterFactory& writer_factory); + /// \brief Returns the number of manifests replaced (consumed into merged outputs) + /// by the last MergeManifests() call. + int32_t ReplacedManifestsCount() const { return replaced_manifests_count_; } + private: /// \brief Merge a group of manifests sharing the same spec_id. /// @@ -109,6 +113,7 @@ class ICEBERG_EXPORT ManifestMergeManager { const int64_t target_size_bytes_; const int32_t min_count_to_merge_; const bool merge_enabled_; + int32_t replaced_manifests_count_{0}; }; } // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 03dc24479..4c28bdbc2 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -110,6 +110,7 @@ iceberg_sources = files( 'type.cc', 'update/expire_snapshots.cc', 'update/fast_append.cc', + 'update/merging_snapshot_update.cc', 'update/pending_update.cc', 'update/set_snapshot.cc', 'update/snapshot_manager.cc', diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index 1b3182fd9..d513e2be1 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -441,6 +441,10 @@ void SnapshotSummaryBuilder::Clear() { metrics_.Clear(); deleted_duplicate_files_ = 0; trust_partition_metrics_ = true; + manifests_counts_set_ = false; + manifests_created_ = 0; + manifests_kept_ = 0; + manifests_replaced_ = 0; } void SnapshotSummaryBuilder::SetPartitionSummaryLimit(int32_t max) { @@ -475,6 +479,14 @@ void SnapshotSummaryBuilder::Set(const std::string& property, const std::string& properties_[property] = value; } +void SnapshotSummaryBuilder::SetManifestCounts(int32_t created, int32_t kept, + int32_t replaced) { + manifests_counts_set_ = true; + manifests_created_ = created; + manifests_kept_ = kept; + manifests_replaced_ = replaced; +} + void SnapshotSummaryBuilder::Merge(const SnapshotSummaryBuilder& other) { for (const auto& [key, value] : other.properties_) { properties_[key] = value; @@ -491,6 +503,10 @@ void SnapshotSummaryBuilder::Merge(const SnapshotSummaryBuilder& other) { } deleted_duplicate_files_ += other.deleted_duplicate_files_; + // Manifest counts (manifests_counts_set_ / manifests_created_ / manifests_kept_ / + // manifests_replaced_) are intentionally not merged here. They are set directly + // on the root summary builder by Apply() after all manifests are finalized, and + // are never populated on sub-builders that get Merge()d in. } std::unordered_map SnapshotSummaryBuilder::Build() const { @@ -504,6 +520,14 @@ std::unordered_map SnapshotSummaryBuilder::Build() con SetIf(deleted_duplicate_files_ > 0, builder, SnapshotSummaryFields::kDeletedDuplicatedFiles, deleted_duplicate_files_); + // Always emit all three manifest count fields together when they have been set. + SetIf(manifests_counts_set_, builder, SnapshotSummaryFields::kManifestsCreated, + manifests_created_); + SetIf(manifests_counts_set_, builder, SnapshotSummaryFields::kManifestsKept, + manifests_kept_); + SetIf(manifests_counts_set_, builder, SnapshotSummaryFields::kManifestsReplaced, + manifests_replaced_); + SetIf(trust_partition_metrics_, builder, SnapshotSummaryFields::kChangedPartitionCountProp, partition_metrics_.size()); diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index f3e7ffb85..178c21dd7 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -338,6 +338,15 @@ class ICEBERG_EXPORT SnapshotSummaryBuilder { /// \param value Property value void Set(const std::string& property, const std::string& value); + /// \brief Set manifest count summary fields. + /// + /// Records how many manifests were created, kept, and replaced in this snapshot. + /// + /// \param created Manifests written by this snapshot + /// \param kept Manifests carried over unchanged from the previous snapshot + /// \param replaced Manifests rewritten or merged away + void SetManifestCounts(int32_t created, int32_t kept, int32_t replaced); + /// \brief Merge another builder's metrics into this one /// /// \param other The builder to merge from @@ -359,6 +368,10 @@ class ICEBERG_EXPORT SnapshotSummaryBuilder { int32_t max_changed_partitions_for_summaries_{0}; int64_t deleted_duplicate_files_{0}; bool trust_partition_metrics_{true}; + bool manifests_counts_set_{false}; + int32_t manifests_created_{0}; + int32_t manifests_kept_{0}; + int32_t manifests_replaced_{0}; }; /// \brief Data operation that produce snapshots. diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 37e66ed59..0b7dc64b8 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -208,6 +208,7 @@ if(ICEBERG_BUILD_BUNDLE) expire_snapshots_test.cc fast_append_test.cc manifest_filter_manager_test.cc + merging_snapshot_update_test.cc name_mapping_update_test.cc snapshot_manager_test.cc transaction_test.cc diff --git a/src/iceberg/test/manifest_filter_manager_test.cc b/src/iceberg/test/manifest_filter_manager_test.cc index 7810509fa..aa1054fec 100644 --- a/src/iceberg/test/manifest_filter_manager_test.cc +++ b/src/iceberg/test/manifest_filter_manager_test.cc @@ -41,6 +41,7 @@ #include "iceberg/test/matchers.h" #include "iceberg/test/update_test_base.h" #include "iceberg/update/fast_append.h" +#include "iceberg/util/data_file_set.h" #include "iceberg/util/macros.h" namespace iceberg { @@ -379,4 +380,186 @@ TEST_F(ManifestFilterManagerTest, MultipleRowFiltersUseCombinedExpression) { EXPECT_EQ(entries[0].status, ManifestStatus::kDeleted); } +// Helper: write one or more delete-file entries to a new manifest. +// Each entry is (DataFile, data_sequence_number). +using DeleteManifestEntry = std::pair, int64_t>; +static Result WriteDeleteManifest( + const std::vector& files, std::shared_ptr file_io, + const TableMetadata& metadata, const std::string& path) { + if (files.empty()) { + return InvalidArgument("WriteDeleteManifest requires at least one entry"); + } + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + int32_t spec_id = files[0].first->partition_spec_id.value_or(0); + ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(spec_id)); + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestWriter::MakeWriter(metadata.format_version, /*snapshot_id=*/1L, path, + file_io, spec, schema, ManifestContent::kDeletes)); + for (auto& [file, seq] : files) { + ManifestEntry entry; + entry.status = ManifestStatus::kAdded; + entry.snapshot_id = 1L; + entry.sequence_number = seq; + entry.data_file = file; + ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry)); + } + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + return writer->ToManifestFile(); +} + +// Convenience overload for a single entry. +static Result WriteDeleteManifest(std::shared_ptr delete_file, + int64_t data_sequence_number, + std::shared_ptr file_io, + const TableMetadata& metadata, + const std::string& path) { + return WriteDeleteManifest({{delete_file, data_sequence_number}}, file_io, metadata, + path); +} + +TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThan) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + // Create a position-delete file with data_sequence_number = 2 (below threshold 5). + auto del_file = std::make_shared(); + del_file->content = DataFile::Content::kPositionDeletes; + del_file->file_path = table_location_ + "/delete/del_old.parquet"; + del_file->file_format = FileFormatType::kParquet; + del_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + del_file->file_size_in_bytes = 512; + del_file->record_count = 10; + del_file->partition_spec_id = spec_->spec_id(); + + auto manifest_path = std::format("{}/metadata/del-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto del_manifest, + WriteDeleteManifest(del_file, /*data_seq=*/2L, file_io_, *metadata, manifest_path)); + + ManifestFilterManager mgr(ManifestContent::kDeletes, file_io_); + // Drop delete files older than sequence number 5: entry (seq=2) should be dropped. + mgr.DropDeleteFilesOlderThan(5); + + std::vector manifests{&del_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr.FilterManifests(schema, specs, manifests, factory)); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); + ASSERT_EQ(entries.size(), 1U); + EXPECT_EQ(entries[0].status, ManifestStatus::kDeleted); +} + +TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanKeepsNewerEntries) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + // Two entries in the same manifest: old (seq=2, below threshold) and new (seq=10, + // above). + auto make_del_file = [&](const std::string& path) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = path; + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 512; + f->record_count = 10; + f->partition_spec_id = spec_->spec_id(); + return f; + }; + auto old_file = make_del_file(table_location_ + "/delete/del_old.parquet"); + auto new_file = make_del_file(table_location_ + "/delete/del_new.parquet"); + + auto manifest_path = std::format("{}/metadata/del-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL(auto del_manifest, + WriteDeleteManifest({{old_file, 2L}, {new_file, 10L}}, file_io_, + *metadata, manifest_path)); + + ManifestFilterManager mgr(ManifestContent::kDeletes, file_io_); + // Threshold=5: old entry dropped, new entry survives as kExisting. + mgr.DropDeleteFilesOlderThan(5); + + std::vector manifests{&del_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr.FilterManifests(schema, specs, manifests, factory)); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); + ASSERT_EQ(entries.size(), 2U); + // The old entry should be dropped; the new entry should survive. + auto deleted = std::count_if( + entries.begin(), entries.end(), + [](const ManifestEntry& e) { return e.status == ManifestStatus::kDeleted; }); + auto existing = std::count_if( + entries.begin(), entries.end(), + [](const ManifestEntry& e) { return e.status == ManifestStatus::kExisting; }); + EXPECT_EQ(deleted, 1); + EXPECT_EQ(existing, 1); + // Verify which entry survived. + for (const auto& e : entries) { + if (e.status == ManifestStatus::kExisting) { + EXPECT_EQ(e.data_file->file_path, new_file->file_path); + } else { + EXPECT_EQ(e.data_file->file_path, old_file->file_path); + } + } +} + +TEST_F(ManifestFilterManagerTest, RemoveDanglingDeletesForFiltersDanglingDV) { + auto* metadata = table_->metadata().get(); + auto factory = MakeWriterFactory(*metadata); + + const std::string data_file_path = table_location_ + "/data/referenced.parquet"; + + // Create a DV (position-delete, puffin format) referencing the data file. + auto dv_file = std::make_shared(); + dv_file->content = DataFile::Content::kPositionDeletes; + dv_file->file_path = table_location_ + "/delete/dv.puffin"; + dv_file->file_format = FileFormatType::kPuffin; + dv_file->referenced_data_file = data_file_path; + dv_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + dv_file->file_size_in_bytes = 256; + dv_file->record_count = 5; + dv_file->partition_spec_id = spec_->spec_id(); + + auto manifest_path = std::format("{}/metadata/dv-manifest-{}.avro", table_location_, + manifest_counter_++); + ICEBERG_UNWRAP_OR_FAIL( + auto dv_manifest, + WriteDeleteManifest(dv_file, /*data_seq=*/3L, file_io_, *metadata, manifest_path)); + + // Register the referenced data file as deleted. + auto deleted_data_file = std::make_shared(); + deleted_data_file->content = DataFile::Content::kData; + deleted_data_file->file_path = data_file_path; + deleted_data_file->partition = PartitionValues(std::vector{Literal::Long(1L)}); + deleted_data_file->file_size_in_bytes = 1024; + deleted_data_file->record_count = 50; + deleted_data_file->partition_spec_id = spec_->spec_id(); + + DataFileSet deleted_files; + deleted_files.insert(deleted_data_file); + + ManifestFilterManager mgr(ManifestContent::kDeletes, file_io_); + mgr.RemoveDanglingDeletesFor(deleted_files); + + std::vector manifests{&dv_manifest}; + auto specs = SpecsById(*metadata); + ICEBERG_UNWRAP_OR_FAIL(auto schema, metadata->Schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr.FilterManifests(schema, specs, manifests, factory)); + + ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); + ASSERT_EQ(entries.size(), 1U); + EXPECT_EQ(entries[0].status, ManifestStatus::kDeleted); +} + } // namespace iceberg diff --git a/src/iceberg/test/merging_snapshot_update_test.cc b/src/iceberg/test/merging_snapshot_update_test.cc new file mode 100644 index 000000000..366b09f25 --- /dev/null +++ b/src/iceberg/test/merging_snapshot_update_test.cc @@ -0,0 +1,736 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/merging_snapshot_update.h" + +#include +#include +#include +#include + +#include +#include + +#include "iceberg/avro/avro_register.h" +#include "iceberg/constants.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/manifest/manifest_reader.h" +#include "iceberg/manifest/manifest_writer.h" +#include "iceberg/partition_spec.h" +#include "iceberg/row/partition_values.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/table.h" +#include "iceberg/table_metadata.h" +#include "iceberg/table_properties.h" +#include "iceberg/test/matchers.h" +#include "iceberg/test/update_test_base.h" +#include "iceberg/transaction.h" +#include "iceberg/update/fast_append.h" +#include "iceberg/update/update_properties.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Concrete subclass of MergingSnapshotUpdate for testing. +class TestMergeAppend : public MergingSnapshotUpdate { + public: + static Result> Make(std::string table_name, + std::shared_ptr table) { + ICEBERG_ASSIGN_OR_RAISE( + auto ctx, TransactionContext::Make(std::move(table), TransactionKind::kUpdate)); + return std::unique_ptr( + new TestMergeAppend(std::move(table_name), std::move(ctx))); + } + + std::string operation() override { return "append"; } + + // Expose protected API for test access + Status AddFile(std::shared_ptr file) { return AddDataFile(std::move(file)); } + Status AddDelete(std::shared_ptr file) { + return AddDeleteFile(std::move(file)); + } + Status RemoveDataFile(std::shared_ptr file) { + return DeleteDataFile(std::move(file)); + } + Status RemoveDeleteFile(std::shared_ptr file) { + return DeleteDeleteFile(std::move(file)); + } + Status AppendManifest(ManifestFile manifest) { + return AddManifest(std::move(manifest)); + } + Result> DataSpec() const { + return MergingSnapshotUpdate::DataSpec(); + } + void SetDataSeqNumber(int64_t seq) { SetNewDataFilesDataSequenceNumber(seq); } + + bool HasDataFiles() const { return AddsDataFiles(); } + bool HasDeleteFiles() const { return AddsDeleteFiles(); } + bool HasDataDeletes() const { return DeletesDataFiles(); } + + private: + TestMergeAppend(std::string table_name, std::shared_ptr ctx) + : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} +}; + +class MergingSnapshotUpdateTest : public MinimalUpdateTestBase { + protected: + static void SetUpTestSuite() { avro::RegisterAll(); } + + void SetUp() override { + MinimalUpdateTestBase::SetUp(); + + ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec()); + ICEBERG_UNWRAP_OR_FAIL(schema_, table_->schema()); + + file_a_ = MakeDataFile("/data/file_a.parquet", /*partition_x=*/1L); + file_b_ = MakeDataFile("/data/file_b.parquet", /*partition_x=*/2L); + } + + std::shared_ptr MakeDataFile(const std::string& path, int64_t partition_x) { + auto f = std::make_shared(); + f->content = DataFile::Content::kData; + f->file_path = table_location_ + path; + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(partition_x)}); + f->file_size_in_bytes = 1024; + f->record_count = 100; + f->partition_spec_id = spec_->spec_id(); + return f; + } + + std::shared_ptr MakeDeleteFile(const std::string& path, int64_t partition_x) { + auto f = MakeDataFile(path, partition_x); + f->content = DataFile::Content::kPositionDeletes; + return f; + } + + Result> NewMergeAppend() { + return TestMergeAppend::Make(TableName(), table_); + } + + // Commit file_a_ with FastAppend and refresh the table. + void CommitFileA() { + ICEBERG_UNWRAP_OR_FAIL(auto fa, table_->NewFastAppend()); + fa->AppendFile(file_a_); + EXPECT_THAT(fa->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + // Read all entries from a list of ManifestFiles. + Result> ReadAllEntries( + const std::vector& manifests, const TableMetadata& metadata) { + std::vector result; + for (const auto& m : manifests) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(m.partition_spec_id)); + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto reader, + ManifestReader::Make(m, file_io_, schema, spec)); + ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries()); + result.insert(result.end(), entries.begin(), entries.end()); + } + return result; + } + + // Write a manifest file containing the given data files. + // Returns a ManifestFile with added_snapshot_id = kInvalidSnapshotId so it + // is eligible for snapshot ID inheritance. + Result WriteManifest( + const std::string& path, const std::vector>& files) { + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestWriter::MakeWriter(/*format_version=*/2, kInvalidSnapshotId, path, + file_io_, spec_, schema_, ManifestContent::kData)); + for (const auto& f : files) { + ManifestEntry entry; + entry.status = ManifestStatus::kAdded; + entry.snapshot_id = std::nullopt; + entry.data_file = f; + ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry)); + } + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + return writer->ToManifestFile(); + } + + std::shared_ptr spec_; + std::shared_ptr schema_; + std::shared_ptr file_a_; + std::shared_ptr file_b_; +}; + +// ------------------------------------------------------------------------- +// State query tests +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AddsDataFilesInitiallyFalse) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_FALSE(op->HasDataFiles()); + EXPECT_FALSE(op->HasDeleteFiles()); + EXPECT_FALSE(op->HasDataDeletes()); +} + +TEST_F(MergingSnapshotUpdateTest, AddsDataFilesTrueAfterAdd) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_TRUE(op->HasDataFiles()); + EXPECT_FALSE(op->HasDeleteFiles()); +} + +TEST_F(MergingSnapshotUpdateTest, AddsDeleteFilesTrueAfterAdd) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_FALSE(op->HasDataFiles()); + EXPECT_TRUE(op->HasDeleteFiles()); +} + +TEST_F(MergingSnapshotUpdateTest, DeletesDataFilesTrueAfterRegisterDelete) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + EXPECT_TRUE(op->HasDataDeletes()); +} + +// ------------------------------------------------------------------------- +// Apply / Commit tests +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, CommitNewDataFile) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at("added-data-files"), "1"); + EXPECT_EQ(snapshot->summary.at("added-records"), "100"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitMultipleDataFiles) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at("added-data-files"), "2"); + EXPECT_EQ(snapshot->summary.at("added-records"), "200"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitDataFileAndDeleteFile) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // Data file summary + EXPECT_EQ(snapshot->summary.at("added-data-files"), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitPreservesExistingManifests) { + // First append: file_a + CommitFileA(); + + // Second merge append: file_b + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // Both data files should be visible — 1 existing + 1 new + EXPECT_EQ(snapshot->summary.at("total-data-files"), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, CommitDeletesDataFile) { + CommitFileA(); + + // Remove file_a via merging snapshot update + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at("total-data-files"), "0"); + EXPECT_EQ(snapshot->summary.at("deleted-data-files"), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, SetNewDataFilesDataSequenceNumber) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + op->SetDataSeqNumber(42); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at("added-data-files"), "1"); +} + +// ------------------------------------------------------------------------- +// CleanUncommitted test +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, CleanUncommittedAfterSuccessfulCommitDoesNotCrash) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + // Simulate a caller invoking CleanUncommitted after a commit (e.g. cleanup + // in an error handler that runs regardless of success). Passing an empty set + // means no manifests are considered committed, so CleanUncommitted attempts + // to delete all written manifests. This should not crash. + op->CleanUncommitted({}); +} + +// ------------------------------------------------------------------------- +// Delete file summary tests +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, CommitDeleteFileSummaryHasAddedDeleteFiles) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedPosDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kRemovedDeleteFiles), 0); +} + +// Covers the bug where deleted delete files were not tracked in the snapshot summary. +TEST_F(MergingSnapshotUpdateTest, CommitDeletesDeleteFileSummaryHasRemovedDeleteFiles) { + // Step 1: commit a delete file. + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + // Step 2: commit a new snapshot that removes the delete file. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDeleteFile(del_file), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kRemovedDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kRemovedPosDeleteFiles), "1"); + EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kAddedDeleteFiles), 0); +} + +// ------------------------------------------------------------------------- +// Deduplication test +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, DuplicateDataFileOnlyCountedOnce) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); // duplicate — should be ignored + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "1"); +} + +// ------------------------------------------------------------------------- +// ValidateNewDeleteFile format version tests +// ------------------------------------------------------------------------- + +/// \brief V1-table test fixture — deletes are not supported in format v1. +class MergingSnapshotUpdateV1Test : public UpdateTestBase { + protected: + std::string MetadataResource() const override { return "TableMetadataV1Valid.json"; } + std::string TableName() const override { return "v1_test_table"; } + + void SetUp() override { + UpdateTestBase::SetUp(); + ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec()); + } + + std::shared_ptr MakeDeleteFile(const std::string& path) { + auto f = std::make_shared(); + f->content = DataFile::Content::kPositionDeletes; + f->file_path = table_location_ + path; + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(1L)}); + f->file_size_in_bytes = 512; + f->record_count = 10; + f->partition_spec_id = spec_->spec_id(); + return f; + } + + Result> NewMergeAppend() { + return TestMergeAppend::Make(TableName(), table_); + } + + std::shared_ptr spec_; +}; + +TEST_F(MergingSnapshotUpdateV1Test, ValidateNewDeleteFileV1Rejected) { + auto del_file = MakeDeleteFile("/delete/del_a.parquet"); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileV2RejectsDeletionVector) { + // Position delete with referenced_data_file set = deletion vector, not allowed in v2. + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + del_file->referenced_data_file = table_location_ + "/data/file_a.parquet"; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNewDeleteFileV2AllowsEqualityDelete) { + auto eq_del = MakeDeleteFile("/delete/eq_del.parquet", 1L); + eq_del->content = DataFile::Content::kEqualityDeletes; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(eq_del), IsOk()); +} + +// ------------------------------------------------------------------------- +// AddManifest — invalid manifest rejection +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsDeleteManifest) { + // Build a ManifestFile with content = kDeletes + ManifestFile del_manifest; + del_manifest.manifest_path = table_location_ + "/metadata/del.avro"; + del_manifest.content = ManifestContent::kDeletes; + del_manifest.added_snapshot_id = kInvalidSnapshotId; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(del_manifest), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithExistingFiles) { + // Construct a ManifestFile that reports existing files without writing to disk. + ManifestFile manifest; + manifest.manifest_path = table_location_ + "/metadata/existing.avro"; + manifest.content = ManifestContent::kData; + manifest.added_snapshot_id = kInvalidSnapshotId; + manifest.existing_files_count = 1; // has_existing_files() returns true + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithDeletedFiles) { + ManifestFile manifest; + manifest.manifest_path = table_location_ + "/metadata/deleted.avro"; + manifest.content = ManifestContent::kData; + manifest.added_snapshot_id = kInvalidSnapshotId; + manifest.deleted_files_count = 1; // has_deleted_files() returns true + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithAssignedSnapshotId) { + ManifestFile manifest; + manifest.manifest_path = table_location_ + "/metadata/snap.avro"; + manifest.content = ManifestContent::kData; + manifest.added_snapshot_id = 12345; // already assigned + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithFirstRowId) { + ManifestFile manifest; + manifest.manifest_path = table_location_ + "/metadata/rowid.avro"; + manifest.content = ManifestContent::kData; + manifest.added_snapshot_id = kInvalidSnapshotId; + manifest.first_row_id = 0; // assigned first_row_id + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); +} + +// ------------------------------------------------------------------------- +// AddManifest — basic commit (inherit path: v2 with can_inherit_snapshot_id) +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AppendManifestEmptyTable) { + auto path = table_location_ + "/metadata/input.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_, file_b_})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + + // In v2 with snapshot ID inheritance, the manifest path is reused directly. + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + ASSERT_EQ(data_manifests.size(), 1); + + EXPECT_EQ(snapshot->summary.at("added-data-files"), "2"); + EXPECT_EQ(snapshot->summary.at("total-data-files"), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, AppendManifestWithDataFiles) { + // Mix AddDataFile + AddManifest — should produce 2 manifests. + auto path = table_location_ + "/metadata/input.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_, file_b_})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); // file_b_ staged directly + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + // Written manifest (file_b_) + appended manifest (file_a_, file_b_) + EXPECT_EQ(data_manifests.size(), 2); + EXPECT_EQ(snapshot->summary.at("added-data-files"), "3"); +} + +// ------------------------------------------------------------------------- +// AddManifest — merge behavior +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, AppendManifestMergeWithMinCountOne) { + // Set min-count-to-merge = 1 so all manifests are merged. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + auto path = table_location_ + "/metadata/input.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_, file_b_})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + // Both manifests merged into one. + EXPECT_EQ(data_manifests.size(), 1); + EXPECT_EQ(snapshot->summary.at("added-data-files"), "3"); +} + +TEST_F(MergingSnapshotUpdateTest, AppendManifestDoNotMergeMinCount) { + // Set min-count-to-merge = 4 so 3 manifests are not merged. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "4"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + auto path1 = table_location_ + "/metadata/m1.avro"; + auto path2 = table_location_ + "/metadata/m2.avro"; + auto path3 = table_location_ + "/metadata/m3.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(path1, {file_a_})); + ICEBERG_UNWRAP_OR_FAIL(auto m2, WriteManifest(path2, {file_b_})); + ICEBERG_UNWRAP_OR_FAIL( + auto m3, WriteManifest(path3, {MakeDataFile("/data/file_c.parquet", 3L)})); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AppendManifest(m1), IsOk()); + EXPECT_THAT(op->AppendManifest(m2), IsOk()); + EXPECT_THAT(op->AppendManifest(m3), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + // Below min-count-to-merge threshold — all 3 pass through unchanged. + EXPECT_EQ(data_manifests.size(), 3); + EXPECT_EQ(snapshot->summary.at("added-data-files"), "3"); +} + +// ------------------------------------------------------------------------- +// Manifest merge — data files only +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, ManifestMergeMergesIntoOne) { + // Set min-count-to-merge = 1 so every append triggers a merge. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + // Snapshot 1: file_a_ + CommitFileA(); + + // Snapshot 2: file_b_ — should merge with existing manifest. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + EXPECT_EQ(data_manifests.size(), 1); + EXPECT_EQ(snapshot->summary.at("total-data-files"), "2"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, ManifestMergeDoesNotMergeWhenBelowMinCount) { + // Default min-count-to-merge = 100, so manifests are not merged. + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + EXPECT_EQ(data_manifests.size(), 2); + EXPECT_EQ(snapshot->summary.at("total-data-files"), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, ManifestMergeDoesNotMergeWhenSizeTargetTooSmall) { + // Set a tiny size target so manifests never merge. + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestTargetSizeBytes.key()), "10"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + EXPECT_EQ(data_manifests.size(), 2); +} + +// ------------------------------------------------------------------------- +// Manifest count summary +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsOnFirstCommit) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "0"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "0"); +} + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsOnSecondCommitNoMerge) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // 1 new manifest created, 1 existing manifest kept, 0 replaced. + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "0"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "1"); +} + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsAfterMerge) { + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // 1 merged output created, 1 existing manifest replaced, 0 kept. + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "0"); +} + +TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsAfterDelete) { + ICEBERG_UNWRAP_OR_FAIL(auto props, table_->NewUpdateProperties()); + props->Set(std::string(TableProperties::kManifestMinMergeCount.key()), "1"); + EXPECT_THAT(props->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + + CommitFileA(); + + // Delete file_a_ — filter manager rewrites the manifest. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + // Filter rewrites 1 manifest (replaced), merge produces 1 output (created). + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsReplaced), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsCreated), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "0"); +} + +// ------------------------------------------------------------------------- +// DataSpec — multiple partition specs +// ------------------------------------------------------------------------- + +TEST_F(MergingSnapshotUpdateTest, DataSpecThrowsWithMultipleSpecs) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + // file_a_ and file_b_ both use spec_id 0 — DataSpec() should succeed. + EXPECT_THAT(op->AddFile(file_a_), IsOk()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->DataSpec(), IsOk()); +} + +TEST_F(MergingSnapshotUpdateTest, DataSpecThrowsWhenEmpty) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + // No files added — DataSpec() should fail. + EXPECT_THAT(op->DataSpec(), IsError(ErrorKind::kInvalidArgument)); +} + +} // namespace iceberg diff --git a/src/iceberg/update/merging_snapshot_update.cc b/src/iceberg/update/merging_snapshot_update.cc new file mode 100644 index 000000000..f2d0c93e0 --- /dev/null +++ b/src/iceberg/update/merging_snapshot_update.cc @@ -0,0 +1,870 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/merging_snapshot_update.h" + +#include +#include +#include +#include + +#include "iceberg/constants.h" +#include "iceberg/delete_file_index.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/inclusive_metrics_evaluator.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/manifest/manifest_list.h" +#include "iceberg/manifest/manifest_reader.h" +#include "iceberg/manifest/manifest_util_internal.h" +#include "iceberg/manifest/manifest_writer.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/table.h" +#include "iceberg/table_metadata.h" +#include "iceberg/table_properties.h" +#include "iceberg/transaction.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/snapshot_util_internal.h" + +namespace iceberg { + +MergingSnapshotUpdate::MergingSnapshotUpdate(std::string table_name, + std::shared_ptr ctx) + : SnapshotUpdate(std::move(ctx)), + table_name_(std::move(table_name)), + delete_expression_(Expressions::AlwaysFalse()), + data_filter_manager_(ManifestContent::kData, ctx_->table->io()), + delete_filter_manager_(ManifestContent::kDeletes, ctx_->table->io()), + data_merge_manager_( + base().properties.Get(TableProperties::kManifestTargetSizeBytes), + base().properties.Get(TableProperties::kManifestMinMergeCount), + base().properties.Get(TableProperties::kManifestMergeEnabled)), + delete_merge_manager_( + base().properties.Get(TableProperties::kManifestTargetSizeBytes), + base().properties.Get(TableProperties::kManifestMinMergeCount), + base().properties.Get(TableProperties::kManifestMergeEnabled)) {} + +// ------------------------------------------------------------------------- +// Primitive API +// ------------------------------------------------------------------------- + +Status MergingSnapshotUpdate::AddDataFile(std::shared_ptr file) { + if (!file) { + return InvalidArgument("Cannot add a null data file"); + } + if (!file->partition_spec_id.has_value()) { + return InvalidArgument("Data file must have a partition spec ID"); + } + + int32_t spec_id = file->partition_spec_id.value(); + ICEBERG_ASSIGN_OR_RAISE(auto spec, base().PartitionSpecById(spec_id)); + + // Suppress first_row_id — it will be assigned by the commit, not inherited from the + // source file. + file->first_row_id = std::nullopt; + + auto& data_files = new_data_files_by_spec_[spec_id]; + auto [it, inserted] = data_files.insert(file); + if (inserted) { + has_new_data_files_ = true; + ICEBERG_RETURN_UNEXPECTED(added_data_files_summary_.AddedFile(*spec, *file)); + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateNewDeleteFile(const DataFile& file) { + if (file.content == DataFile::Content::kData) { + return InvalidArgument("Expected a delete file but got a data file: {}", + file.file_path); + } + const int8_t format_version = base().format_version; + const bool is_dv = file.referenced_data_file.has_value(); + switch (format_version) { + case 1: + return InvalidArgument("Deletes are supported in V2 and above"); + case 2: + // Position deletes must NOT be DVs in v2. + if (file.content == DataFile::Content::kPositionDeletes && is_dv) { + return InvalidArgument("Must not use DVs for position deletes in V2: {}", + file.file_path); + } + break; + default: + if (format_version >= 3) { + // Position deletes MUST be DVs in v3+. + if (file.content == DataFile::Content::kPositionDeletes && !is_dv) { + return InvalidArgument("Must use DVs for position deletes in V{}: {}", + format_version, file.file_path); + } + } else { + return InvalidArgument("Unsupported format version: {}", format_version); + } + break; + } + return {}; +} + +Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file) { + if (!file) { + return InvalidArgument("Cannot add a null delete file"); + } + ICEBERG_RETURN_UNEXPECTED(ValidateNewDeleteFile(*file)); + if (!file->partition_spec_id.has_value()) { + return InvalidArgument("Delete file must have a partition spec ID"); + } + ICEBERG_ASSIGN_OR_RAISE(auto spec, + base().PartitionSpecById(file->partition_spec_id.value())); + ICEBERG_RETURN_UNEXPECTED(added_delete_files_summary_.AddedFile(*spec, *file)); + has_new_delete_files_ = true; + new_delete_files_.push_back(std::move(file)); + return {}; +} + +Status MergingSnapshotUpdate::DeleteDataFile(std::shared_ptr file) { + if (!file) { + return InvalidArgument("Cannot delete a null data file"); + } + return data_filter_manager_.DeleteFile(std::move(file)); +} + +Status MergingSnapshotUpdate::DeleteDeleteFile(std::shared_ptr file) { + if (!file) { + return InvalidArgument("Cannot delete a null delete file"); + } + return delete_filter_manager_.DeleteFile(std::move(file)); +} + +void MergingSnapshotUpdate::DeleteByPath(std::string_view path) { + data_filter_manager_.DeleteFile(path); +} + +Status MergingSnapshotUpdate::DeleteByRowFilter(std::shared_ptr expr) { + // If a delete file matches the row filter, it can also be removed because the rows + // it references will also be deleted. Both filter managers receive the expression. + delete_expression_ = expr; + ICEBERG_RETURN_UNEXPECTED(data_filter_manager_.DeleteByRowFilter(expr)); + return delete_filter_manager_.DeleteByRowFilter(std::move(expr)); +} + +void MergingSnapshotUpdate::DropPartition(int32_t spec_id, PartitionValues partition) { + // Dropping data in a partition also drops all delete files in that partition. + data_filter_manager_.DropPartition(spec_id, partition); + delete_filter_manager_.DropPartition(spec_id, std::move(partition)); +} + +void MergingSnapshotUpdate::FailMissingDeletePaths() { + data_filter_manager_.FailMissingDeletePaths(); + delete_filter_manager_.FailMissingDeletePaths(); +} + +void MergingSnapshotUpdate::FailAnyDelete() { + data_filter_manager_.FailAnyDelete(); + delete_filter_manager_.FailAnyDelete(); +} + +void MergingSnapshotUpdate::SetNewDataFilesDataSequenceNumber(int64_t sequence_number) { + new_data_files_data_seq_number_ = sequence_number; +} + +void MergingSnapshotUpdate::CaseSensitive(bool case_sensitive) { + case_sensitive_ = case_sensitive; + data_filter_manager_.CaseSensitive(case_sensitive); + delete_filter_manager_.CaseSensitive(case_sensitive); +} + +void MergingSnapshotUpdate::Set(const std::string& property, const std::string& value) { + summary_builder().Set(property, value); +} + +Result> MergingSnapshotUpdate::DataSpec() const { + if (new_data_files_by_spec_.empty()) { + return InvalidArgument("DataSpec() called before any data file was added"); + } + if (new_data_files_by_spec_.size() > 1) { + return InvalidArgument( + "DataSpec() requires exactly one partition spec; got {} different specs", + new_data_files_by_spec_.size()); + } + return base().PartitionSpecById(new_data_files_by_spec_.begin()->first); +} + +std::vector> MergingSnapshotUpdate::AddedDataFiles() const { + std::vector> result; + for (const auto& [spec_id, files] : new_data_files_by_spec_) { + for (const auto& file : files) { + result.push_back(file); + } + } + return result; +} + +Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr /*file*/, + int64_t /*data_sequence_number*/) { + return NotImplemented( + "AddDeleteFile with explicit data sequence number is not yet implemented"); +} + +Status MergingSnapshotUpdate::AddManifest(ManifestFile manifest) { + if (manifest.content != ManifestContent::kData) { + return InvalidArgument("Cannot append delete manifest: {}", manifest.manifest_path); + } + if (manifest.has_existing_files()) { + return InvalidArgument("Cannot append manifest with existing files: {}", + manifest.manifest_path); + } + if (manifest.has_deleted_files()) { + return InvalidArgument("Cannot append manifest with deleted files: {}", + manifest.manifest_path); + } + if (manifest.added_snapshot_id != kInvalidSnapshotId) { + return InvalidArgument("Snapshot id must be assigned during commit: {}", + manifest.manifest_path); + } + if (manifest.first_row_id.has_value()) { + return InvalidArgument("Cannot append manifest with assigned first_row_id: {}", + manifest.manifest_path); + } + + if (can_inherit_snapshot_id()) { + appended_manifests_summary_.AddedManifest(manifest); + append_manifests_.push_back(std::move(manifest)); + } else { + ICEBERG_ASSIGN_OR_RAISE(auto copied, CopyManifest(manifest)); + rewritten_append_manifests_.push_back(std::move(copied)); + } + return {}; +} + +Result MergingSnapshotUpdate::CopyManifest(const ManifestFile& manifest) { + const TableMetadata& current = base(); + ICEBERG_ASSIGN_OR_RAISE(auto schema, current.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto spec, + current.PartitionSpecById(manifest.partition_spec_id)); + std::string path = ManifestPath(); + all_written_manifests_.insert(path); + return CopyAppendManifest(manifest, ctx_->table->io(), schema, spec, SnapshotId(), path, + current.format_version, &appended_manifests_summary_); +} + +// ------------------------------------------------------------------------- +// State queries +// ------------------------------------------------------------------------- + +bool MergingSnapshotUpdate::AddsDataFiles() const { + return !new_data_files_by_spec_.empty(); +} + +bool MergingSnapshotUpdate::AddsDeleteFiles() const { return !new_delete_files_.empty(); } + +bool MergingSnapshotUpdate::DeletesDataFiles() const { + return data_filter_manager_.ContainsDeletes(); +} + +bool MergingSnapshotUpdate::DeletesDeleteFiles() const { + return delete_filter_manager_.ContainsDeletes(); +} + +// ------------------------------------------------------------------------- +// Apply pipeline +// ------------------------------------------------------------------------- + +ManifestWriterFactory MergingSnapshotUpdate::MakeTrackedWriterFactory() { + return [this](int32_t spec_id, + ManifestContent content) -> Result> { + const TableMetadata& meta = base(); + ICEBERG_ASSIGN_OR_RAISE(auto schema, meta.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto spec, meta.PartitionSpecById(spec_id)); + std::string path = ManifestPath(); + all_written_manifests_.insert(path); + return ManifestWriter::MakeWriter(meta.format_version, SnapshotId(), std::move(path), + ctx_->table->io(), std::move(spec), + std::move(schema), content); + }; +} + +Result> MergingSnapshotUpdate::WriteNewDataManifests() { + // If new files were staged after the cache was populated (commit retry), invalidate. + if (has_new_data_files_ && cached_new_data_manifests_.has_value()) { + for (const auto& m : *cached_new_data_manifests_) { + std::ignore = DeleteFile(m.manifest_path); + } + cached_new_data_manifests_.reset(); + } + + if (cached_new_data_manifests_.has_value()) { + return *cached_new_data_manifests_; + } + + std::vector result; + for (const auto& [spec_id, data_files] : new_data_files_by_spec_) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, base().PartitionSpecById(spec_id)); + ICEBERG_ASSIGN_OR_RAISE( + auto written, + WriteDataManifests(data_files.as_span(), spec, new_data_files_data_seq_number_)); + for (const auto& m : written) { + all_written_manifests_.insert(m.manifest_path); + } + result.insert(result.end(), std::make_move_iterator(written.begin()), + std::make_move_iterator(written.end())); + } + + cached_new_data_manifests_ = result; + has_new_data_files_ = false; + return result; +} + +Result> MergingSnapshotUpdate::WriteNewDeleteManifests() { + // If new files were staged after the cache was populated (commit retry), invalidate. + if (has_new_delete_files_ && cached_new_delete_manifests_.has_value()) { + for (const auto& m : *cached_new_delete_manifests_) { + std::ignore = DeleteFile(m.manifest_path); + } + cached_new_delete_manifests_.reset(); + } + + if (cached_new_delete_manifests_.has_value()) { + return *cached_new_delete_manifests_; + } + + // Group delete files by partition spec ID, mirroring WriteNewDataManifests(). + std::unordered_map>> + delete_files_by_spec; + for (const auto& file : new_delete_files_) { + delete_files_by_spec[file->partition_spec_id.value()].push_back(file); + } + + std::vector result; + for (const auto& [spec_id, delete_files] : delete_files_by_spec) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, base().PartitionSpecById(spec_id)); + ICEBERG_ASSIGN_OR_RAISE(auto written, + WriteDeleteManifests(std::span(delete_files), spec)); + for (const auto& m : written) { + all_written_manifests_.insert(m.manifest_path); + } + result.insert(result.end(), std::make_move_iterator(written.begin()), + std::make_move_iterator(written.end())); + } + + cached_new_delete_manifests_ = result; + has_new_delete_files_ = false; + return result; +} + +Result> MergingSnapshotUpdate::Apply( + const TableMetadata& metadata_to_update, const std::shared_ptr& snapshot) { + // Re-validate buffered delete files against the current format version. A format + // upgrade between staging and commit could make previously-valid files invalid. + for (const auto& file : new_delete_files_) { + ICEBERG_RETURN_UNEXPECTED(ValidateNewDeleteFile(*file)); + } + + // Rebuild summary from stable sub-builders so that commit retries don't double-count. + summary_builder().Clear(); + summary_builder().Merge(added_data_files_summary_); + summary_builder().Merge(added_delete_files_summary_); + summary_builder().Merge(appended_manifests_summary_); + + auto tracked_factory = MakeTrackedWriterFactory(); + + // Step 1: Filter data manifests. + ICEBERG_ASSIGN_OR_RAISE(auto filtered_data, + data_filter_manager_.FilterManifests( + metadata_to_update, snapshot, tracked_factory)); + + // Track deleted data files in the summary builder. + for (const auto& file : data_filter_manager_.FilesToBeDeleted()) { + if (!file->partition_spec_id.has_value()) { + continue; + } + ICEBERG_ASSIGN_OR_RAISE( + auto spec, metadata_to_update.PartitionSpecById(*file->partition_spec_id)); + ICEBERG_RETURN_UNEXPECTED(summary_builder().DeletedFile(*spec, *file)); + } + + // Step 2: Compute min data sequence number; set up delete filter cleanup. + // Use last_sequence_number as the initial value so that an empty filtered list + // produces a sensible minimum. Skip manifests with kUnassignedSequenceNumber — + // those are manifests written in the current Apply() call whose sequence number + // hasn't been assigned yet. If all filtered manifests are unassigned (e.g. the + // table has no pre-existing data manifests), the fallback to last_sequence_number + // is safe: any delete file with seq > 0 and seq <= last_sequence_number can no + // longer match live data rows, so cleaning them up is correct. + int64_t min_data_seq = metadata_to_update.last_sequence_number; + for (const auto& manifest : filtered_data) { + if (manifest.min_sequence_number != kUnassignedSequenceNumber) { + min_data_seq = std::min(min_data_seq, manifest.min_sequence_number); + } + } + delete_filter_manager_.DropDeleteFilesOlderThan(min_data_seq); + delete_filter_manager_.RemoveDanglingDeletesFor( + data_filter_manager_.FilesToBeDeleted()); + + // Step 3: Filter delete manifests. + ICEBERG_ASSIGN_OR_RAISE(auto filtered_deletes, + delete_filter_manager_.FilterManifests( + metadata_to_update, snapshot, tracked_factory)); + + // Track deleted delete files in the summary builder. + for (const auto& file : delete_filter_manager_.FilesToBeDeleted()) { + if (!file->partition_spec_id.has_value()) { + continue; + } + ICEBERG_ASSIGN_OR_RAISE( + auto spec, metadata_to_update.PartitionSpecById(*file->partition_spec_id)); + ICEBERG_RETURN_UNEXPECTED(summary_builder().DeletedFile(*spec, *file)); + } + + // Drop manifests with no live files — they carry no data and should not be merged + // into the new snapshot. Manifests written by the current snapshot are always kept + // regardless of live-file counts; the merge stage handles any that are empty. + int64_t snapshot_id = SnapshotId(); + auto should_keep = [snapshot_id](const ManifestFile& m) { + return m.has_added_files() || m.has_existing_files() || + m.added_snapshot_id == snapshot_id; + }; + std::erase_if(filtered_data, [&](const ManifestFile& m) { return !should_keep(m); }); + std::erase_if(filtered_deletes, [&](const ManifestFile& m) { return !should_keep(m); }); + + // Step 4: Write (or retrieve cached) new data manifests. + ICEBERG_ASSIGN_OR_RAISE(auto written_data_manifests, WriteNewDataManifests()); + + // Incorporate append manifests (from AddManifest), stamping each with the + // current snapshot ID. append_manifests_ are used directly (inherit path); + // rewritten_append_manifests_ were already copied with the snapshot ID. + std::vector new_data_manifests = std::move(written_data_manifests); + for (const auto& src : append_manifests_) { + ManifestFile m = src; + m.added_snapshot_id = snapshot_id; + new_data_manifests.push_back(std::move(m)); + } + for (const auto& src : rewritten_append_manifests_) { + ManifestFile m = src; + m.added_snapshot_id = snapshot_id; + new_data_manifests.push_back(std::move(m)); + } + + // Step 5: Write (or retrieve cached) new delete manifests. + ICEBERG_ASSIGN_OR_RAISE(auto new_delete_manifests, WriteNewDeleteManifests()); + + // Step 6: Merge data manifests. + ICEBERG_ASSIGN_OR_RAISE(auto merged_data, + data_merge_manager_.MergeManifests( + filtered_data, new_data_manifests, SnapshotId(), + metadata_to_update, ctx_->table->io(), tracked_factory)); + + // Step 7: Merge delete manifests. + ICEBERG_ASSIGN_OR_RAISE(auto merged_deletes, + delete_merge_manager_.MergeManifests( + filtered_deletes, new_delete_manifests, SnapshotId(), + metadata_to_update, ctx_->table->io(), tracked_factory)); + + std::vector result; + result.reserve(merged_data.size() + merged_deletes.size()); + result.insert(result.end(), std::make_move_iterator(merged_data.begin()), + std::make_move_iterator(merged_data.end())); + result.insert(result.end(), std::make_move_iterator(merged_deletes.begin()), + std::make_move_iterator(merged_deletes.end())); + + // Manifest count summary. + int32_t manifests_created = 0; + int32_t manifests_kept = 0; + for (const auto& m : result) { + if (m.added_snapshot_id == snapshot_id) { + ++manifests_created; + } else { + ++manifests_kept; + } + } + int32_t replaced_manifests_count = data_filter_manager_.ReplacedManifestsCount() + + delete_filter_manager_.ReplacedManifestsCount() + + data_merge_manager_.ReplacedManifestsCount() + + delete_merge_manager_.ReplacedManifestsCount(); + summary_builder().SetManifestCounts(manifests_created, manifests_kept, + replaced_manifests_count); + + return result; +} + +void MergingSnapshotUpdate::CleanUncommitted( + const std::unordered_set& committed) { + for (const auto& path : all_written_manifests_) { + if (!committed.contains(path)) { + std::ignore = DeleteFile(path); + } + } + all_written_manifests_.clear(); + cached_new_data_manifests_.reset(); + cached_new_delete_manifests_.reset(); + has_new_data_files_ = false; + has_new_delete_files_ = false; + + // rewritten_append_manifests_ are always owned by the table (copied by us), + // so delete any that were not committed. + for (const auto& m : rewritten_append_manifests_) { + if (!committed.contains(m.manifest_path)) { + std::ignore = DeleteFile(m.manifest_path); + } + } + + // append_manifests_ are only owned by the table if the commit succeeded + // (i.e., at least one manifest was committed). + if (!committed.empty()) { + for (const auto& m : append_manifests_) { + if (!committed.contains(m.manifest_path)) { + std::ignore = DeleteFile(m.manifest_path); + } + } + } +} + +std::unordered_map MergingSnapshotUpdate::Summary() { + summary_builder().SetPartitionSummaryLimit( + base().properties.Get(TableProperties::kWritePartitionSummaryLimit)); + return summary_builder().Build(); +} + +// ------------------------------------------------------------------------- +// Conflict-detection helpers +// ------------------------------------------------------------------------- + +Status MergingSnapshotUpdate::ValidateAddedDataFiles( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive) { + if (parent == nullptr) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto ancestors, + SnapshotUtil::AncestorsBetween(metadata, parent->snapshot_id, + starting_snapshot_id)); + + // Build the full set of matching snapshot IDs first, then scan their manifests. + // The full set must be known before filtering manifests, since a manifest may have + // been written by a different snapshot in the ancestry range. + std::unordered_set matching_snapshot_ids; + for (const auto& snap : ancestors) { + auto op = snap->Operation(); + if (op == DataOperation::kAppend || op == DataOperation::kOverwrite) { + matching_snapshot_ids.insert(snap->snapshot_id); + } + } + + std::unique_ptr evaluator; + if (filter != nullptr) { + ICEBERG_ASSIGN_OR_RAISE( + evaluator, InclusiveMetricsEvaluator::Make(filter, *schema, case_sensitive)); + } + + for (const auto& snapshot : ancestors) { + if (!matching_snapshot_ids.contains(snapshot->snapshot_id)) { + continue; + } + auto cached = SnapshotCache(snapshot.get()); + ICEBERG_ASSIGN_OR_RAISE(auto data_manifests, cached.DataManifests(io)); + + for (const auto& manifest : data_manifests) { + if (!matching_snapshot_ids.contains(manifest.added_snapshot_id)) { + continue; + } + ICEBERG_ASSIGN_OR_RAISE(auto spec, + metadata.PartitionSpecById(manifest.partition_spec_id)); + ICEBERG_ASSIGN_OR_RAISE(auto reader, + ManifestReader::Make(manifest, io, schema, spec)); + ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries()); + + for (const auto& entry : entries) { + if (entry.status != ManifestStatus::kAdded) { + continue; + } + if (entry.data_file == nullptr) { + continue; + } + if (evaluator != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(bool matches, evaluator->Evaluate(*entry.data_file)); + if (!matches) { + continue; + } + } + return InvalidArgument( + "Found conflicting files that can contain rows matching {}:" + " {} in snapshot {}", + filter != nullptr ? filter->ToString() : "any expression", + entry.data_file->file_path, snapshot->snapshot_id); + } + } + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateDataFilesExist( + const TableMetadata& metadata, int64_t starting_snapshot_id, + const std::unordered_set& file_paths, bool allow_deletes, + std::shared_ptr filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive) { + if (parent == nullptr || file_paths.empty()) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto ancestors, + SnapshotUtil::AncestorsBetween(metadata, parent->snapshot_id, + starting_snapshot_id)); + + // Build the full set of matching snapshot IDs first, then scan their manifests. + // The full set must be known before filtering manifests, since a manifest may have + // been written by a different snapshot in the ancestry range. + // Included operations: OVERWRITE and REPLACE always; DELETE when allow_deletes is + // false. + std::unordered_set matching_snapshot_ids; + for (const auto& snap : ancestors) { + auto op = snap->Operation(); + if (op == DataOperation::kOverwrite || op == DataOperation::kReplace) { + matching_snapshot_ids.insert(snap->snapshot_id); + } else if (!allow_deletes && op == DataOperation::kDelete) { + matching_snapshot_ids.insert(snap->snapshot_id); + } + } + + // Build a metrics evaluator for the conflict-detection filter, if provided. + std::unique_ptr evaluator; + if (filter != nullptr) { + ICEBERG_ASSIGN_OR_RAISE( + evaluator, InclusiveMetricsEvaluator::Make(filter, *schema, case_sensitive)); + } + + for (const auto& snapshot : ancestors) { + if (!matching_snapshot_ids.contains(snapshot->snapshot_id)) { + continue; + } + auto cached = SnapshotCache(snapshot.get()); + ICEBERG_ASSIGN_OR_RAISE(auto data_manifests, cached.DataManifests(io)); + + for (const auto& manifest : data_manifests) { + if (!matching_snapshot_ids.contains(manifest.added_snapshot_id)) { + continue; + } + ICEBERG_ASSIGN_OR_RAISE(auto spec, + metadata.PartitionSpecById(manifest.partition_spec_id)); + ICEBERG_ASSIGN_OR_RAISE(auto reader, + ManifestReader::Make(manifest, io, schema, spec)); + ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries()); + + for (const auto& entry : entries) { + if (entry.status != ManifestStatus::kDeleted) { + continue; + } + if (entry.data_file == nullptr) { + continue; + } + if (!file_paths.contains(entry.data_file->file_path)) { + continue; + } + if (evaluator != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(bool matches, evaluator->Evaluate(*entry.data_file)); + if (!matches) { + continue; + } + } + return InvalidArgument("Cannot commit, missing data files: {} in snapshot {}", + entry.data_file->file_path, snapshot->snapshot_id); + } + } + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + const TableMetadata& metadata, int64_t starting_snapshot_id, + const DataFileSet& replaced_files, const std::shared_ptr& parent, + std::shared_ptr io, bool ignore_equality_deletes) { + if (parent == nullptr || replaced_files.empty() || metadata.format_version < 2) { + return {}; + } + + // Build an index of delete files added since starting_snapshot_id. + // Covers both position and equality deletes; the caller controls whether + // equality deletes are ignored. + ICEBERG_ASSIGN_OR_RAISE(auto deletes, AddedDeleteFiles(metadata, starting_snapshot_id, + nullptr, nullptr, parent, io)); + + if (deletes->empty()) { + return {}; + } + + // Compute the starting sequence number for the data file check. + int64_t starting_seq = TableMetadata::kInitialSequenceNumber; + if (auto snap_result = metadata.SnapshotById(starting_snapshot_id); + snap_result.has_value()) { + starting_seq = snap_result.value()->sequence_number; + } + + for (const auto& data_file : replaced_files) { + ICEBERG_ASSIGN_OR_RAISE(auto delete_files, + deletes->ForDataFile(starting_seq, *data_file)); + if (ignore_equality_deletes) { + // Only fail on position deletes — equality deletes at higher sequence numbers + // still apply to the rewritten files and are not a conflict. + for (const auto& df : delete_files) { + if (df->content == DataFile::Content::kPositionDeletes) { + return InvalidArgument( + "Cannot commit, found new position delete for replaced data file: {}", + data_file->file_path); + } + } + } else { + if (!delete_files.empty()) { + return InvalidArgument( + "Cannot commit, found new delete for replaced data file: {}", + data_file->file_path); + } + } + } + return {}; +} + +Status MergingSnapshotUpdate::ValidateAddedDataFiles( + const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, + const PartitionSet& /*partition_set*/, const std::shared_ptr& /*parent*/, + std::shared_ptr /*io*/) { + return NotImplemented( + "ValidateAddedDataFiles with PartitionSet is not yet implemented"); +} + +Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, + std::shared_ptr /*data_filter*/, const DataFileSet& /*replaced_files*/, + const std::shared_ptr& /*parent*/, std::shared_ptr /*io*/) { + return NotImplemented( + "ValidateNoNewDeletesForDataFiles with data filter is not yet implemented"); +} + +Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, + std::shared_ptr /*data_filter*/, + const std::shared_ptr& /*parent*/, std::shared_ptr /*io*/) { + return NotImplemented( + "ValidateNoNewDeleteFiles with Expression is not yet implemented"); +} + +Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, + const PartitionSet& /*partition_set*/, const std::shared_ptr& /*parent*/, + std::shared_ptr /*io*/) { + return NotImplemented( + "ValidateNoNewDeleteFiles with PartitionSet is not yet implemented"); +} + +Status MergingSnapshotUpdate::ValidateDeletedDataFiles( + const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, + std::shared_ptr /*data_filter*/, + const std::shared_ptr& /*parent*/, std::shared_ptr /*io*/) { + return NotImplemented( + "ValidateDeletedDataFiles with Expression is not yet implemented"); +} + +Status MergingSnapshotUpdate::ValidateDeletedDataFiles( + const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, + const PartitionSet& /*partition_set*/, const std::shared_ptr& /*parent*/, + std::shared_ptr /*io*/) { + return NotImplemented( + "ValidateDeletedDataFiles with PartitionSet is not yet implemented"); +} + +Result> MergingSnapshotUpdate::AddedDeleteFiles( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, std::shared_ptr partition_set, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + + if (parent == nullptr || metadata.format_version < 2) { + ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, + TableMetadataCache(&metadata).GetPartitionSpecsById()); + std::unordered_map> specs_by_id( + specs_ref.get().begin(), specs_ref.get().end()); + ICEBERG_ASSIGN_OR_RAISE(auto builder, DeleteFileIndex::BuilderFor( + io, schema, std::move(specs_by_id), {})); + return builder.Build(); + } + + ICEBERG_ASSIGN_OR_RAISE(auto ancestors, + SnapshotUtil::AncestorsBetween(metadata, parent->snapshot_id, + starting_snapshot_id)); + + // Collect delete manifests from OVERWRITE and DELETE snapshots only. + std::unordered_set matching_snapshot_ids; + for (const auto& snap : ancestors) { + auto op = snap->Operation(); + if (op == DataOperation::kOverwrite || op == DataOperation::kDelete) { + matching_snapshot_ids.insert(snap->snapshot_id); + } + } + + std::vector delete_manifests; + for (const auto& snapshot : ancestors) { + if (!matching_snapshot_ids.contains(snapshot->snapshot_id)) { + continue; + } + auto cached = SnapshotCache(snapshot.get()); + ICEBERG_ASSIGN_OR_RAISE(auto manifests, cached.DeleteManifests(io)); + for (const auto& m : manifests) { + if (matching_snapshot_ids.contains(m.added_snapshot_id)) { + delete_manifests.push_back(m); + } + } + } + + // Compute the starting sequence number from the starting snapshot. + int64_t starting_seq = TableMetadata::kInitialSequenceNumber; + if (auto snap_result = metadata.SnapshotById(starting_snapshot_id); + snap_result.has_value()) { + starting_seq = snap_result.value()->sequence_number; + } + + ICEBERG_ASSIGN_OR_RAISE(auto specs_ref, + TableMetadataCache(&metadata).GetPartitionSpecsById()); + std::unordered_map> specs_by_id( + specs_ref.get().begin(), specs_ref.get().end()); + + ICEBERG_ASSIGN_OR_RAISE(auto builder, + DeleteFileIndex::BuilderFor(io, schema, std::move(specs_by_id), + std::move(delete_manifests))); + builder.AfterSequenceNumber(starting_seq); + builder.CaseSensitive(case_sensitive); + if (data_filter != nullptr) { + builder.DataFilter(std::move(data_filter)); + } + if (partition_set != nullptr) { + builder.FilterPartitions(std::move(partition_set)); + } + return builder.Build(); +} + +Status MergingSnapshotUpdate::ValidateAddedDVs( + const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, + std::shared_ptr /*conflict_filter*/, + const std::shared_ptr& /*parent*/, std::shared_ptr /*io*/) { + return NotImplemented( + "ValidateAddedDVs is not yet supported (deletion vectors require format v3)"); +} + +} // namespace iceberg diff --git a/src/iceberg/update/merging_snapshot_update.h b/src/iceberg/update/merging_snapshot_update.h new file mode 100644 index 000000000..a4030bb4d --- /dev/null +++ b/src/iceberg/update/merging_snapshot_update.h @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/update/merging_snapshot_update.h + +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/delete_file_index.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/manifest/manifest_filter_manager.h" +#include "iceberg/manifest/manifest_merge_manager.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" +#include "iceberg/update/snapshot_update.h" +#include "iceberg/util/data_file_set.h" + +namespace iceberg { + +/// \brief Abstract base class for all merge-based snapshot write operations. +/// +/// Provides the complete filter → write → merge pipeline that all merge-based +/// operations (MergeAppend, OverwriteFiles, RowDelta, ReplacePartitions, +/// RewriteFiles) share. Subclasses only need to implement `operation()` and +/// call the protected primitive API to describe what changes to make. +/// +/// The Apply() pipeline: +/// 1. Filter data manifests (via data_filter_manager_) +/// 2. Compute min data sequence number and set up delete filter cleanup +/// 3. Filter delete manifests (via delete_filter_manager_) +/// 4. Write new data manifests (cached for commit retry) +/// 5. Write new delete manifests (cached for commit retry) +/// 6. Merge data manifests (via data_merge_manager_) +/// 7. Merge delete manifests (via delete_merge_manager_) +/// +class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { + public: + ~MergingSnapshotUpdate() override = default; + + // SnapshotUpdate overrides + Result> Apply( + const TableMetadata& metadata_to_update, + const std::shared_ptr& snapshot) override; + + void CleanUncommitted(const std::unordered_set& committed) override; + + std::unordered_map Summary() override; + + /// \brief Set a custom property in the snapshot summary. + void Set(const std::string& property, const std::string& value); + + protected: + /// \brief Constructor; reads merge configuration from table properties. + explicit MergingSnapshotUpdate(std::string table_name, + std::shared_ptr ctx); + + /// \brief Stage a data file to be added to the table. + Status AddDataFile(std::shared_ptr file); + + /// \brief Stage a delete file to be added to the table. + Status AddDeleteFile(std::shared_ptr file); + + /// \brief Validate a delete file against the table format version rules. + /// + /// - Format v1: deletes are not supported. + /// - Format v2: position deletes must NOT be deletion vectors (DVs). + /// - Format v3+: position deletes MUST be deletion vectors (DVs). + Status ValidateNewDeleteFile(const DataFile& file); + + /// \brief Stage a delete file with an explicit data sequence number. + /// + /// \note Not yet implemented; returns NotImplemented error. + Status AddDeleteFile(std::shared_ptr file, int64_t data_sequence_number); + + /// \brief Add all files in a pre-existing data manifest to the new snapshot. + /// + /// The manifest must contain only DATA content and only ADDED entries (no + /// existing or deleted files). If snapshot ID inheritance is enabled and the + /// manifest has no snapshot ID assigned, it is used directly; otherwise it is + /// copied with the current snapshot ID. + Status AddManifest(ManifestFile manifest); + + /// \brief Register a data file (by object) to be deleted from the table. + Status DeleteDataFile(std::shared_ptr file); + + /// \brief Register a delete file (by object) to be removed from the table. + Status DeleteDeleteFile(std::shared_ptr file); + + /// \brief Register a data file path to be deleted from the table. + /// + /// \note Only applies to data files. To remove delete files, use DeleteDeleteFile(). + void DeleteByPath(std::string_view path); + + /// \brief Register an expression to delete matching rows. + /// + /// Both data and delete filter managers receive the expression: delete files that + /// match the row filter can also be removed because those rows will be deleted. + Status DeleteByRowFilter(std::shared_ptr expr); + + /// \brief Register a partition to be dropped. + /// + /// Both data and delete filter managers receive the partition drop, since dropping + /// data in a partition also drops all delete files in that partition. + void DropPartition(int32_t spec_id, PartitionValues partition); + + /// \brief Fail if any registered delete path is not found in any manifest. + void FailMissingDeletePaths(); + + /// \brief Fail if any manifest entry matches a delete condition. + void FailAnyDelete(); + + /// \brief Override the data sequence number assigned to all newly-added data files. + void SetNewDataFilesDataSequenceNumber(int64_t sequence_number); + + /// \brief Set case sensitivity for row filter and expression evaluation. + void CaseSensitive(bool case_sensitive); + + /// \brief Returns true if case-sensitive matching is enabled (default: true). + bool IsCaseSensitive() const { return case_sensitive_; } + + /// \brief Returns true if any data files have been staged for addition. + bool AddsDataFiles() const; + + /// \brief Returns true if any delete files have been staged for addition. + bool AddsDeleteFiles() const; + + /// \brief Returns true if any data files have been registered for deletion. + bool DeletesDataFiles() const; + + /// \brief Returns true if any delete files have been registered for removal. + bool DeletesDeleteFiles() const; + + /// \brief Returns the row-filter expression set via DeleteByRowFilter, or nullptr. + const std::shared_ptr& RowFilter() const { return delete_expression_; } + + /// \brief Returns the single partition spec for all staged data files. + /// + /// Precondition: exactly one partition spec ID must be represented among staged + /// data files. + Result> DataSpec() const; + + /// \brief Returns all data files staged for addition. + std::vector> AddedDataFiles() const; + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a data file matching the given filter expression. + static Status ValidateAddedDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr filter, + const std::shared_ptr& parent, + std::shared_ptr io, + bool case_sensitive = true); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a data file in any partition of the given partition set. + /// + /// \note Not yet implemented; returns NotImplemented error. + static Status ValidateAddedDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// removed a file whose path is in file_paths (and allow_deletes is false). + static Status ValidateDataFilesExist( + const TableMetadata& metadata, int64_t starting_snapshot_id, + const std::unordered_set& file_paths, bool allow_deletes, + std::shared_ptr filter, const std::shared_ptr& parent, + std::shared_ptr io, bool case_sensitive = true); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a delete file that covers a file in replaced_files. + /// + /// Whether equality deletes are checked is derived automatically from whether + /// a custom data sequence number was set via SetNewDataFilesDataSequenceNumber(): + /// if set, equality deletes are ignored because they still apply to the rewritten + /// files and are not a conflict. + /// + /// Subclasses should prefer this overload over the static one. + Status ValidateNoNewDeletesForDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const DataFileSet& replaced_files, + const std::shared_ptr& parent, + std::shared_ptr io) const { + return ValidateNoNewDeletesForDataFiles(metadata, starting_snapshot_id, + replaced_files, parent, io, + new_data_files_data_seq_number_.has_value()); + } + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a delete file that covers a file in replaced_files. + /// + /// \param ignore_equality_deletes If true, only position deletes are checked. + /// Set to true when replaced data files have the same sequence number as the + /// new files (e.g. RewriteFiles), so equality deletes at higher sequence numbers + /// still apply and are not a conflict. + static Status ValidateNoNewDeletesForDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const DataFileSet& replaced_files, + const std::shared_ptr& parent, + std::shared_ptr io, + bool ignore_equality_deletes = false); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a delete file matching the data filter that covers a file in replaced_files. + /// + /// \note Not yet implemented; returns NotImplemented error. + static Status ValidateNoNewDeletesForDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const DataFileSet& replaced_files, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a delete file matching the given row filter. + /// + /// \note Not yet implemented; returns NotImplemented error. + static Status ValidateNoNewDeleteFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a delete file matching any partition in the given partition set. + /// + /// \note Not yet implemented; returns NotImplemented error. + static Status ValidateNoNewDeleteFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// deleted a data file matching the given row filter. + /// + /// \note Not yet implemented; returns NotImplemented error. + static Status ValidateDeletedDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// deleted a data file in any partition of the given partition set. + /// + /// \note Not yet implemented; returns NotImplemented error. + static Status ValidateDeletedDataFiles(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io); + + /// \brief Build a DeleteFileIndex of delete files added since starting_snapshot_id. + static Result> AddedDeleteFiles( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, + std::shared_ptr partition_set, + const std::shared_ptr& parent, std::shared_ptr io, + bool case_sensitive = true); + + /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] + /// added a deletion vector that conflicts with DVs being written. + /// + /// \note Deletion vectors (format v3) are not yet supported; returns NotImplemented. + static Status ValidateAddedDVs(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr conflict_filter, + const std::shared_ptr& parent, + std::shared_ptr io); + + private: + /// \brief Create a ManifestWriterFactory that records every path it creates in + /// all_written_manifests_. + ManifestWriterFactory MakeTrackedWriterFactory(); + + /// \brief Copy a manifest with the current snapshot ID, for use when snapshot + /// ID inheritance is not possible. + Result CopyManifest(const ManifestFile& manifest); + + /// \brief Write new data manifests for staged data files; caches the result. + Result> WriteNewDataManifests(); + + /// \brief Write new delete manifests for staged delete files; caches the result. + Result> WriteNewDeleteManifests(); + + // Used for commit event notifications and diagnostic log messages. + std::string table_name_; + std::shared_ptr delete_expression_; + bool case_sensitive_ = true; + + // Stable sub-builders for added files — accumulated across retries and merged + // into summary_builder_ at the start of each Apply() call. + SnapshotSummaryBuilder added_data_files_summary_; + SnapshotSummaryBuilder added_delete_files_summary_; + SnapshotSummaryBuilder appended_manifests_summary_; + + ManifestFilterManager data_filter_manager_; + ManifestFilterManager delete_filter_manager_; + ManifestMergeManager data_merge_manager_; + ManifestMergeManager delete_merge_manager_; + + std::unordered_map new_data_files_by_spec_; + std::vector> new_delete_files_; + std::optional new_data_files_data_seq_number_; + + // Manifests passed via AddManifest(): inherit path (no copy needed) and + // rewrite path (must be copied with the current snapshot ID). + std::vector append_manifests_; + std::vector rewritten_append_manifests_; + + // Set to true when new files are staged after the cache was populated, so the + // cache is invalidated and re-written on the next Apply() call (commit retry). + bool has_new_data_files_ = false; + bool has_new_delete_files_ = false; + + std::optional> cached_new_data_manifests_; + std::optional> cached_new_delete_manifests_; + + /// Tracks every manifest path created via MakeTrackedWriterFactory, plus the + /// paths in cached_new_*_manifests_. Used by CleanUncommitted(). + std::unordered_set all_written_manifests_; +}; + +} // namespace iceberg diff --git a/src/iceberg/update/snapshot_update.cc b/src/iceberg/update/snapshot_update.cc index a59ebdc72..fc5359e42 100644 --- a/src/iceberg/update/snapshot_update.cc +++ b/src/iceberg/update/snapshot_update.cc @@ -41,7 +41,7 @@ namespace iceberg { namespace { -// The Java impl skips updating total if parsing fails. Here we choose to be strict. +// Skips updating total if parsing fails would be lenient; here we choose to be strict. Status UpdateTotal(std::unordered_map& summary, const std::unordered_map& previous_summary, const std::string& total_property, const std::string& added_property, @@ -398,14 +398,10 @@ void SnapshotUpdate::CleanAll() { } Status SnapshotUpdate::DeleteFile(const std::string& path) { - static const auto kDefaultDeleteFunc = [this](const std::string& path) { - return this->ctx_->table->io()->DeleteFile(path); - }; if (delete_func_) { return delete_func_(path); - } else { - return kDefaultDeleteFunc(path); } + return ctx_->table->io()->DeleteFile(path); } std::string SnapshotUpdate::ManifestListPath() { diff --git a/src/iceberg/update/update_partition_statistics.h b/src/iceberg/update/update_partition_statistics.h index 982b1bd39..86bf1b081 100644 --- a/src/iceberg/update/update_partition_statistics.h +++ b/src/iceberg/update/update_partition_statistics.h @@ -65,9 +65,8 @@ class ICEBERG_EXPORT UpdatePartitionStatistics : public PendingUpdate { /// \brief Partition statistics updates are intentionally not retried today. /// - /// This matches the current Java `SetPartitionStatistics` behavior, which commits - /// directly without a retry loop. Keep this conservative until we add explicit replay - /// coverage for this update type. + /// Commits directly without a retry loop. Keep this conservative until we add + /// explicit replay coverage for this update type. bool IsRetryable() const override { return false; } struct ApplyResult { From a23819521c700ca84e524061e850c0ea835db8d3 Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Wed, 27 May 2026 14:54:03 +0800 Subject: [PATCH 2/4] Align merging snapshot update with Java parity Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../manifest/manifest_filter_manager.cc | 18 +- .../manifest/manifest_filter_manager.h | 20 +- .../manifest/manifest_merge_manager.cc | 8 +- .../test/manifest_filter_manager_test.cc | 36 +- .../test/manifest_merge_manager_test.cc | 28 + .../test/merging_snapshot_update_test.cc | 391 +++++++++++++- src/iceberg/test/snapshot_util_test.cc | 14 +- src/iceberg/update/merging_snapshot_update.cc | 482 ++++++++++++------ src/iceberg/update/merging_snapshot_update.h | 26 +- src/iceberg/update/snapshot_update.cc | 20 +- src/iceberg/update/snapshot_update.h | 9 + src/iceberg/util/snapshot_util.cc | 4 +- src/iceberg/util/snapshot_util_internal.h | 12 +- 13 files changed, 827 insertions(+), 241 deletions(-) diff --git a/src/iceberg/manifest/manifest_filter_manager.cc b/src/iceberg/manifest/manifest_filter_manager.cc index 2b53158bc..6630b9fd4 100644 --- a/src/iceberg/manifest/manifest_filter_manager.cc +++ b/src/iceberg/manifest/manifest_filter_manager.cc @@ -94,7 +94,6 @@ void ManifestFilterManager::DeleteFile(std::string_view path) { Status ManifestFilterManager::DeleteFile(std::shared_ptr file) { ICEBERG_PRECHECK(file != nullptr, "Cannot delete file: null"); delete_paths_.insert(file->file_path); - delete_files_.insert(std::move(file)); return {}; } @@ -132,9 +131,7 @@ Result ManifestFilterManager::CanContainDroppedFiles(const ManifestFile&) // manifests once object-delete partitions are tracked separately. // Currently, DeleteFile(std::shared_ptr) degrades to a path-based delete, // which forces scanning all manifests. - // Also open delete manifests when a minimum sequence number is set for cleanup. - return !delete_paths_.empty() || !removed_data_file_paths_.empty() || - (manifest_content_ == ManifestContent::kDeletes && min_sequence_number_ > 0); + return !delete_paths_.empty() || !removed_data_file_paths_.empty(); } Result ManifestFilterManager::CanContainDroppedPartitions( @@ -385,6 +382,16 @@ Status ManifestFilterManager::ValidateRequiredDeletes( Result> ManifestFilterManager::FilterManifests( const TableMetadata& metadata, const std::shared_ptr& base_snapshot, const ManifestWriterFactory& writer_factory) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + return FilterManifests(schema, metadata, base_snapshot, writer_factory); +} + +Result> ManifestFilterManager::FilterManifests( + const std::shared_ptr& schema, const TableMetadata& metadata, + const std::shared_ptr& base_snapshot, + const ManifestWriterFactory& writer_factory) { + delete_files_.clear(); + replaced_manifests_count_ = 0; if (!base_snapshot) { ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes({})); return std::vector{}; @@ -402,7 +409,6 @@ Result> ManifestFilterManager::FilterManifests( manifests.push_back(&manifest); } - ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); TableMetadataCache metadata_cache(&metadata); ICEBERG_ASSIGN_OR_RAISE(auto specs_by_id, metadata_cache.GetPartitionSpecsById()); @@ -426,7 +432,9 @@ Result> ManifestFilterManager::FilterManifests( } std::unordered_set found_paths; + delete_files_.clear(); if (manifests.empty()) { + replaced_manifests_count_ = 0; ICEBERG_RETURN_UNEXPECTED(ValidateRequiredDeletes(found_paths)); return std::vector{}; } diff --git a/src/iceberg/manifest/manifest_filter_manager.h b/src/iceberg/manifest/manifest_filter_manager.h index 981b9ac3b..e319623b0 100644 --- a/src/iceberg/manifest/manifest_filter_manager.h +++ b/src/iceberg/manifest/manifest_filter_manager.h @@ -85,19 +85,16 @@ class ICEBERG_EXPORT ManifestFilterManager { /// \brief Register a file object for deletion. /// /// Any manifest entry whose file_path matches file->file_path will be marked - /// DELETED. The file object is retained in FilesToBeDeleted(), allowing callers - /// to enumerate deleted file objects for follow-up delete-file cleanup. - /// Duplicate registrations (same path) are silently ignored. + /// DELETED. Duplicate registrations (same path) are silently ignored. /// /// \param file The data/delete file to delete (must not be null) Status DeleteFile(std::shared_ptr file); /// \brief Returns the set of file objects marked for deletion by this manager. /// - /// This includes files registered via DeleteFile(DataFile) and files discovered - /// during FilterManifests() that were deleted by path, partition, or row-filter - /// matching. Used by higher-level operations (e.g. RowDelta) to enumerate the - /// deleted data files for delete-file cleanup. + /// This is populated by the most recent FilterManifests() call and contains only + /// files that were actually deleted from filtered manifests. Used by higher-level + /// operations (e.g. RowDelta) to enumerate deleted data files for follow-up cleanup. const DataFileSet& FilesToBeDeleted() const; /// \brief Register a partition for dropping. @@ -159,6 +156,15 @@ class ICEBERG_EXPORT ManifestFilterManager { const TableMetadata& metadata, const std::shared_ptr& base_snapshot, const ManifestWriterFactory& writer_factory); + /// \brief Apply all accumulated delete conditions using an explicit schema. + /// + /// This overload is used when callers need row-filter evaluation bound against a + /// schema other than metadata.Schema(), such as the schema at a branch head. + Result> FilterManifests( + const std::shared_ptr& schema, const TableMetadata& metadata, + const std::shared_ptr& base_snapshot, + const ManifestWriterFactory& writer_factory); + /// \brief Apply all accumulated delete conditions to the provided manifests. /// /// This overload accepts only the context needed for filtering. It is intended for diff --git a/src/iceberg/manifest/manifest_merge_manager.cc b/src/iceberg/manifest/manifest_merge_manager.cc index aedcea735..e29a96846 100644 --- a/src/iceberg/manifest/manifest_merge_manager.cc +++ b/src/iceberg/manifest/manifest_merge_manager.cc @@ -142,8 +142,12 @@ Result> ManifestMergeManager::MergeGroup( } else { ICEBERG_ASSIGN_OR_RAISE( auto merged, FlushBin(bin, snapshot_id, metadata, file_io, writer_factory)); - // Each manifest consumed into the merged output (beyond the 1 output) is replaced. - replaced_manifests_count_ += static_cast(bin.size()) - 1; + if (bin.size() > 1) { + replaced_manifests_count_ += static_cast(std::ranges::count_if( + bin, [snapshot_id](const ManifestFile* manifest) { + return manifest->added_snapshot_id != snapshot_id; + })); + } result.push_back(std::move(merged)); } } diff --git a/src/iceberg/test/manifest_filter_manager_test.cc b/src/iceberg/test/manifest_filter_manager_test.cc index aa1054fec..7dc4ee2d9 100644 --- a/src/iceberg/test/manifest_filter_manager_test.cc +++ b/src/iceberg/test/manifest_filter_manager_test.cc @@ -418,7 +418,7 @@ static Result WriteDeleteManifest(std::shared_ptr delete path); } -TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThan) { +TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanDoesNotRewriteOnItsOwn) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); @@ -439,7 +439,6 @@ TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThan) { WriteDeleteManifest(del_file, /*data_seq=*/2L, file_io_, *metadata, manifest_path)); ManifestFilterManager mgr(ManifestContent::kDeletes, file_io_); - // Drop delete files older than sequence number 5: entry (seq=2) should be dropped. mgr.DropDeleteFilesOlderThan(5); std::vector manifests{&del_manifest}; @@ -449,17 +448,16 @@ TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThan) { ICEBERG_UNWRAP_OR_FAIL(auto result, mgr.FilterManifests(schema, specs, manifests, factory)); - ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); - ASSERT_EQ(entries.size(), 1U); - EXPECT_EQ(entries[0].status, ManifestStatus::kDeleted); + ASSERT_EQ(result.size(), 1U); + EXPECT_EQ(result[0].manifest_path, del_manifest.manifest_path); } -TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanKeepsNewerEntries) { +TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanDuringDeleteManifestRewrite) { auto* metadata = table_->metadata().get(); auto factory = MakeWriterFactory(*metadata); - // Two entries in the same manifest: old (seq=2, below threshold) and new (seq=10, - // above). + // Three entries in the same manifest: old (seq=2, below threshold), targeted + // (explicit path delete), and keep (survives the rewrite). auto make_del_file = [&](const std::string& path) { auto f = std::make_shared(); f->content = DataFile::Content::kPositionDeletes; @@ -472,16 +470,18 @@ TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanKeepsNewerEntries) { return f; }; auto old_file = make_del_file(table_location_ + "/delete/del_old.parquet"); - auto new_file = make_del_file(table_location_ + "/delete/del_new.parquet"); + auto targeted_file = make_del_file(table_location_ + "/delete/del_targeted.parquet"); + auto keep_file = make_del_file(table_location_ + "/delete/del_keep.parquet"); auto manifest_path = std::format("{}/metadata/del-manifest-{}.avro", table_location_, manifest_counter_++); ICEBERG_UNWRAP_OR_FAIL(auto del_manifest, - WriteDeleteManifest({{old_file, 2L}, {new_file, 10L}}, file_io_, - *metadata, manifest_path)); + WriteDeleteManifest( + {{old_file, 2L}, {targeted_file, 10L}, {keep_file, 10L}}, + file_io_, *metadata, manifest_path)); ManifestFilterManager mgr(ManifestContent::kDeletes, file_io_); - // Threshold=5: old entry dropped, new entry survives as kExisting. + mgr.DeleteFile(targeted_file->file_path); mgr.DropDeleteFilesOlderThan(5); std::vector manifests{&del_manifest}; @@ -492,22 +492,22 @@ TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanKeepsNewerEntries) { mgr.FilterManifests(schema, specs, manifests, factory)); ICEBERG_UNWRAP_OR_FAIL(auto entries, ReadAllEntries(result, *metadata)); - ASSERT_EQ(entries.size(), 2U); - // The old entry should be dropped; the new entry should survive. + ASSERT_EQ(entries.size(), 3U); auto deleted = std::count_if( entries.begin(), entries.end(), [](const ManifestEntry& e) { return e.status == ManifestStatus::kDeleted; }); auto existing = std::count_if( entries.begin(), entries.end(), [](const ManifestEntry& e) { return e.status == ManifestStatus::kExisting; }); - EXPECT_EQ(deleted, 1); + EXPECT_EQ(deleted, 2); EXPECT_EQ(existing, 1); - // Verify which entry survived. for (const auto& e : entries) { if (e.status == ManifestStatus::kExisting) { - EXPECT_EQ(e.data_file->file_path, new_file->file_path); + EXPECT_EQ(e.data_file->file_path, keep_file->file_path); } else { - EXPECT_EQ(e.data_file->file_path, old_file->file_path); + EXPECT_THAT(e.data_file->file_path, + ::testing::AnyOf(old_file->file_path, targeted_file->file_path)); + EXPECT_EQ(e.status, ManifestStatus::kDeleted); } } } diff --git a/src/iceberg/test/manifest_merge_manager_test.cc b/src/iceberg/test/manifest_merge_manager_test.cc index b19eace86..b5dd60f9e 100644 --- a/src/iceberg/test/manifest_merge_manager_test.cc +++ b/src/iceberg/test/manifest_merge_manager_test.cc @@ -192,6 +192,34 @@ TEST_F(ManifestMergeManagerTest, MergeOccursAtThreshold) { EXPECT_EQ(count1, 3); } +TEST_F(ManifestMergeManagerTest, ReplacedManifestCountTracksPreviousSnapshotInputs) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + m0.added_snapshot_id = kSnapshotId - 1; + m1.added_snapshot_id = kSnapshotId - 2; + + ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr.MergeManifests({m0, m1}, {}, kSnapshotId, *metadata_, file_io_, + MakeWriterFactory())); + + EXPECT_EQ(result.size(), 1U); + EXPECT_EQ(mgr.ReplacedManifestsCount(), 2); +} + +TEST_F(ManifestMergeManagerTest, ReplacedManifestCountIgnoresCurrentSnapshotInputs) { + ICEBERG_UNWRAP_OR_FAIL(auto m0, WriteManifest(kSpecId0, 1, /*size=*/100)); + ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); + + ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); + ICEBERG_UNWRAP_OR_FAIL(auto result, + mgr.MergeManifests({}, {m0, m1}, kSnapshotId, *metadata_, file_io_, + MakeWriterFactory())); + + EXPECT_EQ(result.size(), 1U); + EXPECT_EQ(mgr.ReplacedManifestsCount(), 0); +} + TEST_F(ManifestMergeManagerTest, OversizedManifestPassedThrough) { // m_large exceeds target → must not be merged; m_small fits ICEBERG_UNWRAP_OR_FAIL(auto m_large, WriteManifest(kSpecId0, 2, /*size=*/2000)); diff --git a/src/iceberg/test/merging_snapshot_update_test.cc b/src/iceberg/test/merging_snapshot_update_test.cc index 366b09f25..d436b7c53 100644 --- a/src/iceberg/test/merging_snapshot_update_test.cc +++ b/src/iceberg/test/merging_snapshot_update_test.cc @@ -29,6 +29,7 @@ #include "iceberg/avro/avro_register.h" #include "iceberg/constants.h" +#include "iceberg/expression/expressions.h" #include "iceberg/manifest/manifest_entry.h" #include "iceberg/manifest/manifest_reader.h" #include "iceberg/manifest/manifest_writer.h" @@ -66,6 +67,9 @@ class TestMergeAppend : public MergingSnapshotUpdate { Status AddDelete(std::shared_ptr file) { return AddDeleteFile(std::move(file)); } + Status AddDelete(std::shared_ptr file, int64_t data_sequence_number) { + return AddDeleteFile(std::move(file), data_sequence_number); + } Status RemoveDataFile(std::shared_ptr file) { return DeleteDataFile(std::move(file)); } @@ -78,7 +82,59 @@ class TestMergeAppend : public MergingSnapshotUpdate { Result> DataSpec() const { return MergingSnapshotUpdate::DataSpec(); } + int64_t GeneratedSnapshotId() { return SnapshotId(); } void SetDataSeqNumber(int64_t seq) { SetNewDataFilesDataSequenceNumber(seq); } + static Status ValidateAddedDataFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateAddedDataFiles(metadata, starting_snapshot_id, + nullptr, parent, std::move(io)); + } + static Status ValidateAddedDataFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateAddedDataFiles( + metadata, starting_snapshot_id, partition_set, parent, std::move(io)); + } + static Status ValidateNoNewDeletesForDataFilesForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, const DataFileSet& replaced_files, + const std::shared_ptr& parent, std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( + metadata, starting_snapshot_id, std::move(data_filter), replaced_files, parent, + std::move(io)); + } + static Status ValidateNoNewDeleteFilesForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + metadata, starting_snapshot_id, std::move(data_filter), parent, std::move(io)); + } + static Status ValidateNoNewDeleteFilesForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateNoNewDeleteFiles( + metadata, starting_snapshot_id, partition_set, parent, std::move(io)); + } + static Status ValidateDeletedDataFilesForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateDeletedDataFiles( + metadata, starting_snapshot_id, std::move(data_filter), parent, std::move(io)); + } + static Status ValidateDeletedDataFilesForTest( + const TableMetadata& metadata, int64_t starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + return MergingSnapshotUpdate::ValidateDeletedDataFiles( + metadata, starting_snapshot_id, partition_set, parent, std::move(io)); + } bool HasDataFiles() const { return AddsDataFiles(); } bool HasDeleteFiles() const { return AddsDeleteFiles(); } @@ -89,6 +145,34 @@ class TestMergeAppend : public MergingSnapshotUpdate { : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} }; +class TestOverwriteUpdate : public MergingSnapshotUpdate { + public: + static Result> Make( + std::string table_name, std::shared_ptr
table) { + ICEBERG_ASSIGN_OR_RAISE( + auto ctx, TransactionContext::Make(std::move(table), TransactionKind::kUpdate)); + return std::unique_ptr( + new TestOverwriteUpdate(std::move(table_name), std::move(ctx))); + } + + std::string operation() override { return DataOperation::kOverwrite; } + int64_t GeneratedSnapshotId() { return SnapshotId(); } + + Status AddDelete(std::shared_ptr file) { + return AddDeleteFile(std::move(file)); + } + Status AddDelete(std::shared_ptr file, int64_t data_sequence_number) { + return AddDeleteFile(std::move(file), data_sequence_number); + } + Status RemoveDataFile(std::shared_ptr file) { + return DeleteDataFile(std::move(file)); + } + + private: + TestOverwriteUpdate(std::string table_name, std::shared_ptr ctx) + : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} +}; + class MergingSnapshotUpdateTest : public MinimalUpdateTestBase { protected: static void SetUpTestSuite() { avro::RegisterAll(); } @@ -121,10 +205,22 @@ class MergingSnapshotUpdateTest : public MinimalUpdateTestBase { return f; } + std::shared_ptr MakeEqualityDeleteFile(const std::string& path, + int64_t partition_x) { + auto f = MakeDeleteFile(path, partition_x); + f->content = DataFile::Content::kEqualityDeletes; + f->equality_ids = {1}; + return f; + } + Result> NewMergeAppend() { return TestMergeAppend::Make(TableName(), table_); } + Result> NewOverwriteUpdate() { + return TestOverwriteUpdate::Make(TableName(), table_); + } + // Commit file_a_ with FastAppend and refresh the table. void CommitFileA() { ICEBERG_UNWRAP_OR_FAIL(auto fa, table_->NewFastAppend()); @@ -168,6 +264,28 @@ class MergingSnapshotUpdateTest : public MinimalUpdateTestBase { return writer->ToManifestFile(); } + Result> MakeSyntheticSnapshot( + std::string operation, int64_t snapshot_id, + std::optional parent_snapshot_id, int64_t sequence_number, + const std::vector& manifests) { + auto manifest_list_path = + table_location_ + "/metadata/manifest-list-" + std::to_string(snapshot_id) + ".avro"; + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestListWriter::MakeWriter(table_->metadata()->format_version, snapshot_id, + parent_snapshot_id, manifest_list_path, file_io_, + sequence_number)); + ICEBERG_RETURN_UNEXPECTED(writer->AddAll(manifests)); + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + + ICEBERG_ASSIGN_OR_RAISE( + auto snapshot, + Snapshot::Make(sequence_number, snapshot_id, parent_snapshot_id, TimePointMs{}, + std::move(operation), {}, table_->metadata()->current_schema_id, + manifest_list_path)); + return std::shared_ptr(std::move(snapshot)); + } + std::shared_ptr spec_; std::shared_ptr schema_; std::shared_ptr file_a_; @@ -193,7 +311,7 @@ TEST_F(MergingSnapshotUpdateTest, AddsDataFilesTrueAfterAdd) { } TEST_F(MergingSnapshotUpdateTest, AddsDeleteFilesTrueAfterAdd) { - auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); EXPECT_THAT(op->AddDelete(del_file), IsOk()); EXPECT_FALSE(op->HasDataFiles()); @@ -236,7 +354,7 @@ TEST_F(MergingSnapshotUpdateTest, CommitMultipleDataFiles) { } TEST_F(MergingSnapshotUpdateTest, CommitDataFileAndDeleteFile) { - auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); EXPECT_THAT(op->AddFile(file_a_), IsOk()); @@ -323,6 +441,25 @@ TEST_F(MergingSnapshotUpdateTest, CommitDeleteFileSummaryHasAddedDeleteFiles) { EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kRemovedDeleteFiles), 0); } +TEST_F(MergingSnapshotUpdateTest, AddDeleteFileWithExplicitSequenceWritesSequenceNumber) { + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->AddDelete(del_file, 17), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), nullptr)); + auto delete_manifest_it = + std::find_if(manifests.begin(), manifests.end(), [](const ManifestFile& manifest) { + return manifest.content == ManifestContent::kDeletes; + }); + ASSERT_NE(delete_manifest_it, manifests.end()); + ICEBERG_UNWRAP_OR_FAIL( + auto entries, ReadAllEntries(std::vector{*delete_manifest_it}, + *table_->metadata())); + ASSERT_EQ(entries.size(), 1U); + ASSERT_TRUE(entries[0].sequence_number.has_value()); + EXPECT_EQ(entries[0].sequence_number.value(), 17); +} + // Covers the bug where deleted delete files were not tracked in the snapshot summary. TEST_F(MergingSnapshotUpdateTest, CommitDeletesDeleteFileSummaryHasRemovedDeleteFiles) { // Step 1: commit a delete file. @@ -434,48 +571,60 @@ TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsDeleteManifest) { EXPECT_THAT(op->AppendManifest(del_manifest), IsError(ErrorKind::kInvalidArgument)); } -TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithExistingFiles) { - // Construct a ManifestFile that reports existing files without writing to disk. +TEST_F(MergingSnapshotUpdateTest, AddManifestAllowsManifestWithExistingFilesCount) { ManifestFile manifest; manifest.manifest_path = table_location_ + "/metadata/existing.avro"; manifest.content = ManifestContent::kData; manifest.added_snapshot_id = kInvalidSnapshotId; - manifest.existing_files_count = 1; // has_existing_files() returns true + manifest.existing_files_count = 1; ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); - EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); } -TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithDeletedFiles) { +TEST_F(MergingSnapshotUpdateTest, AddManifestAllowsManifestWithDeletedFilesCount) { ManifestFile manifest; manifest.manifest_path = table_location_ + "/metadata/deleted.avro"; manifest.content = ManifestContent::kData; manifest.added_snapshot_id = kInvalidSnapshotId; - manifest.deleted_files_count = 1; // has_deleted_files() returns true + manifest.deleted_files_count = 1; ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); - EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); } -TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithAssignedSnapshotId) { - ManifestFile manifest; - manifest.manifest_path = table_location_ + "/metadata/snap.avro"; - manifest.content = ManifestContent::kData; - manifest.added_snapshot_id = 12345; // already assigned +TEST_F(MergingSnapshotUpdateTest, AddManifestCopiesManifestWithAssignedSnapshotId) { + auto path = table_location_ + "/metadata/snap.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_})); + manifest.added_snapshot_id = 12345; ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); - EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + ASSERT_EQ(data_manifests.size(), 1U); + EXPECT_NE(data_manifests[0].manifest_path, path); } -TEST_F(MergingSnapshotUpdateTest, AddManifestRejectsManifestWithFirstRowId) { - ManifestFile manifest; - manifest.manifest_path = table_location_ + "/metadata/rowid.avro"; - manifest.content = ManifestContent::kData; - manifest.added_snapshot_id = kInvalidSnapshotId; - manifest.first_row_id = 0; // assigned first_row_id +TEST_F(MergingSnapshotUpdateTest, AddManifestCopiesManifestWithFirstRowId) { + auto path = table_location_ + "/metadata/rowid.avro"; + ICEBERG_UNWRAP_OR_FAIL(auto manifest, WriteManifest(path, {file_a_})); + manifest.first_row_id = 0; ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); - EXPECT_THAT(op->AppendManifest(manifest), IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(op->AppendManifest(manifest), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + ICEBERG_UNWRAP_OR_FAIL(auto data_manifests, + SnapshotCache(snapshot.get()).DataManifests(file_io_)); + ASSERT_EQ(data_manifests.size(), 1U); + EXPECT_NE(data_manifests[0].manifest_path, path); } // ------------------------------------------------------------------------- @@ -715,6 +864,204 @@ TEST_F(MergingSnapshotUpdateTest, SummaryManifestCountsAfterDelete) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kManifestsKept), "0"); } +TEST_F(MergingSnapshotUpdateTest, MissingRequestedDeleteDoesNotAffectSummary) { + CommitFileA(); + + auto missing = MakeDataFile("/data/missing.parquet", 3L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewMergeAppend()); + EXPECT_THAT(op->RemoveDataFile(missing), IsOk()); + EXPECT_THAT(op->AddFile(file_b_), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.count(SnapshotSummaryFields::kDeletedDataFiles), 0U); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "2"); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesFailsForTruncatedHistory) { + auto metadata = std::make_shared(); + metadata->format_version = 2; + metadata->location = table_location_; + metadata->current_schema_id = 0; + metadata->schemas.push_back(schema_); + + auto make_snapshot = [](int64_t snapshot_id, std::optional parent_snapshot_id) { + return std::make_shared(Snapshot{ + .snapshot_id = snapshot_id, + .parent_snapshot_id = parent_snapshot_id, + .sequence_number = snapshot_id, + .timestamp_ms = TimePointMs{}, + .manifest_list = "", + .summary = {}, + .schema_id = 0, + }); + }; + + auto base_snapshot = make_snapshot(1, std::nullopt); + auto main_snapshot = make_snapshot(2, 1); + auto branch_snapshot = make_snapshot(3, 1); + metadata->snapshots = {base_snapshot, main_snapshot, branch_snapshot}; + + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest(*metadata, /*starting=*/2, + branch_snapshot, file_io_), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesWithPartitionSetDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + ICEBERG_UNWRAP_OR_FAIL(auto fast_append, table_->NewFastAppend()); + fast_append->AppendFile(file_b_); + EXPECT_THAT(fast_append->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto second_snapshot, table_->current_snapshot()); + + PartitionSet partition_set; + ASSERT_TRUE(partition_set.add(spec_->spec_id(), file_b_->partition)); + EXPECT_THAT(TestMergeAppend::ValidateAddedDataFilesForTest( + *table_->metadata(), first_snapshot->snapshot_id, partition_set, + second_snapshot, file_io_), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeletesForDataFilesWithFilterDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + DataFileSet replaced_files; + replaced_files.insert(file_a_); + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeletesForDataFilesForTest( + *metadata, first_snapshot->snapshot_id, + Expressions::AlwaysTrue(), replaced_files, second_snapshot, file_io_), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeleteFilesWithExpressionDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeleteFilesForTest( + *metadata, first_snapshot->snapshot_id, + Expressions::AlwaysTrue(), second_snapshot, file_io_), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeleteFilesWithPartitionSetDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + auto del_file = MakeEqualityDeleteFile("/delete/del_a.parquet", 1L); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->AddDelete(del_file), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + PartitionSet partition_set; + ASSERT_TRUE(partition_set.add(spec_->spec_id(), del_file->partition)); + EXPECT_THAT(TestMergeAppend::ValidateNoNewDeleteFilesForTest( + *metadata, first_snapshot->snapshot_id, partition_set, + second_snapshot, file_io_), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateDeletedDataFilesWithExpressionDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + EXPECT_THAT(TestMergeAppend::ValidateDeletedDataFilesForTest( + *metadata, first_snapshot->snapshot_id, + Expressions::AlwaysTrue(), second_snapshot, file_io_), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST_F(MergingSnapshotUpdateTest, ValidateDeletedDataFilesWithPartitionSetDetectsConflict) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwriteUpdate()); + EXPECT_THAT(op->RemoveDataFile(file_a_), IsOk()); + const int64_t second_snapshot_id = op->GeneratedSnapshotId(); + ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), first_snapshot)); + ICEBERG_UNWRAP_OR_FAIL( + auto second_snapshot, + MakeSyntheticSnapshot(DataOperation::kOverwrite, second_snapshot_id, + first_snapshot->snapshot_id, + first_snapshot->sequence_number + 1, manifests)); + + auto metadata = std::make_shared(*table_->metadata()); + metadata->snapshots.push_back(second_snapshot); + metadata->current_snapshot_id = second_snapshot->snapshot_id; + metadata->last_sequence_number = second_snapshot->sequence_number; + + PartitionSet partition_set; + ASSERT_TRUE(partition_set.add(spec_->spec_id(), file_a_->partition)); + EXPECT_THAT(TestMergeAppend::ValidateDeletedDataFilesForTest( + *metadata, first_snapshot->snapshot_id, partition_set, + second_snapshot, file_io_), + IsError(ErrorKind::kInvalidArgument)); +} + // ------------------------------------------------------------------------- // DataSpec — multiple partition specs // ------------------------------------------------------------------------- diff --git a/src/iceberg/test/snapshot_util_test.cc b/src/iceberg/test/snapshot_util_test.cc index a47b403da..83ef09f4b 100644 --- a/src/iceberg/test/snapshot_util_test.cc +++ b/src/iceberg/test/snapshot_util_test.cc @@ -301,10 +301,20 @@ TEST_F(SnapshotUtilTest, SchemaForBranch) { ICEBERG_UNWRAP_OR_FAIL(auto initial_schema, table_->schema()); ASSERT_NE(initial_schema, nullptr); + auto branch_schema = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "id", int32()), + SchemaField::MakeRequired(2, "data", string()), + SchemaField::MakeOptional(3, "branch_only", string())}, + 1); + table_->metadata()->schemas.push_back(branch_schema); + ICEBERG_UNWRAP_OR_FAIL(auto branch_snapshot, table_->SnapshotById(branch_snapshot_id_)); + branch_snapshot->schema_id = branch_schema->schema_id(); + std::string branch = "b1"; ICEBERG_UNWRAP_OR_FAIL(auto schema, SnapshotUtil::SchemaFor(*table_, branch)); - // Branch should return current schema (not snapshot schema) - EXPECT_EQ(schema->fields().size(), initial_schema->fields().size()); + EXPECT_EQ(schema->schema_id(), branch_schema->schema_id()); + EXPECT_EQ(schema->fields().size(), branch_schema->fields().size()); + EXPECT_NE(schema->fields().size(), initial_schema->fields().size()); } TEST_F(SnapshotUtilTest, SchemaForTag) { diff --git a/src/iceberg/update/merging_snapshot_update.cc b/src/iceberg/update/merging_snapshot_update.cc index f2d0c93e0..738dc2dcd 100644 --- a/src/iceberg/update/merging_snapshot_update.cc +++ b/src/iceberg/update/merging_snapshot_update.cc @@ -45,6 +45,110 @@ namespace iceberg { +namespace { + +bool MatchesOperation(std::optional operation, + std::initializer_list expected) { + return operation.has_value() && + std::find(expected.begin(), expected.end(), operation.value()) != + expected.end(); +} + +struct ValidationHistoryResult { + std::vector manifests; + std::unordered_set snapshot_ids; +}; + +Result>> ValidationAncestorsBetween( + const TableMetadata& metadata, int64_t latest_snapshot_id, + int64_t starting_snapshot_id) { + ICEBERG_ASSIGN_OR_RAISE( + auto ancestors, + SnapshotUtil::AncestorsBetween(metadata, latest_snapshot_id, starting_snapshot_id)); + if (latest_snapshot_id == starting_snapshot_id) { + return ancestors; + } + if (ancestors.empty()) { + return InvalidArgument("Cannot validate history: starting snapshot {} is not an ancestor " + "of snapshot {}", + starting_snapshot_id, latest_snapshot_id); + } + + const auto& oldest_checked = ancestors.back(); + if (oldest_checked == nullptr || !oldest_checked->parent_snapshot_id.has_value() || + oldest_checked->parent_snapshot_id.value() != starting_snapshot_id) { + return InvalidArgument("Cannot validate history: starting snapshot {} is not an ancestor " + "of snapshot {}", + starting_snapshot_id, latest_snapshot_id); + } + return ancestors; +} + +Result ValidationHistory( + const TableMetadata& metadata, int64_t latest_snapshot_id, + int64_t starting_snapshot_id, + std::initializer_list matching_operations, + ManifestContent content, const std::shared_ptr& io) { + ICEBERG_ASSIGN_OR_RAISE( + auto ancestors, + ValidationAncestorsBetween(metadata, latest_snapshot_id, starting_snapshot_id)); + + ValidationHistoryResult result; + for (const auto& snapshot : ancestors) { + if (!MatchesOperation(snapshot->Operation(), matching_operations)) { + continue; + } + + result.snapshot_ids.insert(snapshot->snapshot_id); + auto cached = SnapshotCache(snapshot.get()); + ICEBERG_ASSIGN_OR_RAISE( + auto manifests, content == ManifestContent::kData ? cached.DataManifests(io) + : cached.DeleteManifests(io)); + for (const auto& manifest : manifests) { + if (manifest.added_snapshot_id == snapshot->snapshot_id) { + result.manifests.push_back(manifest); + } + } + } + + return result; +} + +Result> FindMatchingDataFile( + const TableMetadata& metadata, const std::vector& manifests, + ManifestStatus status, std::shared_ptr filter, + const PartitionSet* partition_set, const std::shared_ptr& io, + bool case_sensitive) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + auto partition_filter = partition_set != nullptr + ? std::make_shared(*partition_set) + : std::shared_ptr{}; + + for (const auto& manifest : manifests) { + ICEBERG_ASSIGN_OR_RAISE(auto spec, + metadata.PartitionSpecById(manifest.partition_spec_id)); + ICEBERG_ASSIGN_OR_RAISE(auto reader, ManifestReader::Make(manifest, io, schema, spec)); + reader->CaseSensitive(case_sensitive); + if (filter != nullptr) { + reader->FilterRows(filter); + } + if (partition_filter != nullptr) { + reader->FilterPartitions(partition_filter); + } + + ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries()); + for (const auto& entry : entries) { + if (entry.status == status && entry.data_file != nullptr) { + return entry.data_file->file_path; + } + } + } + + return std::optional{}; +} + +} // namespace + MergingSnapshotUpdate::MergingSnapshotUpdate(std::string table_name, std::shared_ptr ctx) : SnapshotUpdate(std::move(ctx)), @@ -122,6 +226,16 @@ Status MergingSnapshotUpdate::ValidateNewDeleteFile(const DataFile& file) { } Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file) { + return AddDeleteFile(std::move(file), std::nullopt); +} + +Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file, + int64_t data_sequence_number) { + return AddDeleteFile(std::move(file), std::optional(data_sequence_number)); +} + +Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file, + std::optional data_sequence_number) { if (!file) { return InvalidArgument("Cannot add a null delete file"); } @@ -133,7 +247,9 @@ Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file) { base().PartitionSpecById(file->partition_spec_id.value())); ICEBERG_RETURN_UNEXPECTED(added_delete_files_summary_.AddedFile(*spec, *file)); has_new_delete_files_ = true; - new_delete_files_.push_back(std::move(file)); + new_delete_files_.push_back( + PendingDeleteFile{.file = std::move(file), + .data_sequence_number = std::move(data_sequence_number)}); return {}; } @@ -215,34 +331,12 @@ std::vector> MergingSnapshotUpdate::AddedDataFiles() c return result; } -Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr /*file*/, - int64_t /*data_sequence_number*/) { - return NotImplemented( - "AddDeleteFile with explicit data sequence number is not yet implemented"); -} - Status MergingSnapshotUpdate::AddManifest(ManifestFile manifest) { if (manifest.content != ManifestContent::kData) { return InvalidArgument("Cannot append delete manifest: {}", manifest.manifest_path); } - if (manifest.has_existing_files()) { - return InvalidArgument("Cannot append manifest with existing files: {}", - manifest.manifest_path); - } - if (manifest.has_deleted_files()) { - return InvalidArgument("Cannot append manifest with deleted files: {}", - manifest.manifest_path); - } - if (manifest.added_snapshot_id != kInvalidSnapshotId) { - return InvalidArgument("Snapshot id must be assigned during commit: {}", - manifest.manifest_path); - } - if (manifest.first_row_id.has_value()) { - return InvalidArgument("Cannot append manifest with assigned first_row_id: {}", - manifest.manifest_path); - } - - if (can_inherit_snapshot_id()) { + if (can_inherit_snapshot_id() && manifest.added_snapshot_id == kInvalidSnapshotId && + !manifest.first_row_id.has_value()) { appended_manifests_summary_.AddedManifest(manifest); append_manifests_.push_back(std::move(manifest)); } else { @@ -254,7 +348,7 @@ Status MergingSnapshotUpdate::AddManifest(ManifestFile manifest) { Result MergingSnapshotUpdate::CopyManifest(const ManifestFile& manifest) { const TableMetadata& current = base(); - ICEBERG_ASSIGN_OR_RAISE(auto schema, current.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto schema, SnapshotUtil::SchemaFor(current, target_branch())); ICEBERG_ASSIGN_OR_RAISE(auto spec, current.PartitionSpecById(manifest.partition_spec_id)); std::string path = ManifestPath(); @@ -285,17 +379,16 @@ bool MergingSnapshotUpdate::DeletesDeleteFiles() const { // Apply pipeline // ------------------------------------------------------------------------- -ManifestWriterFactory MergingSnapshotUpdate::MakeTrackedWriterFactory() { - return [this](int32_t spec_id, +ManifestWriterFactory MergingSnapshotUpdate::MakeTrackedWriterFactory( + const std::shared_ptr& schema) { + return [this, schema](int32_t spec_id, ManifestContent content) -> Result> { const TableMetadata& meta = base(); - ICEBERG_ASSIGN_OR_RAISE(auto schema, meta.Schema()); ICEBERG_ASSIGN_OR_RAISE(auto spec, meta.PartitionSpecById(spec_id)); std::string path = ManifestPath(); all_written_manifests_.insert(path); return ManifestWriter::MakeWriter(meta.format_version, SnapshotId(), std::move(path), - ctx_->table->io(), std::move(spec), - std::move(schema), content); + ctx_->table->io(), std::move(spec), schema, content); }; } @@ -344,17 +437,25 @@ Result> MergingSnapshotUpdate::WriteNewDeleteManifests } // Group delete files by partition spec ID, mirroring WriteNewDataManifests(). - std::unordered_map>> - delete_files_by_spec; - for (const auto& file : new_delete_files_) { - delete_files_by_spec[file->partition_spec_id.value()].push_back(file); + std::unordered_map> delete_files_by_spec; + for (const auto& pending_file : new_delete_files_) { + delete_files_by_spec[pending_file.file->partition_spec_id.value()].push_back( + pending_file); } std::vector result; - for (const auto& [spec_id, delete_files] : delete_files_by_spec) { + for (auto& [spec_id, delete_files] : delete_files_by_spec) { ICEBERG_ASSIGN_OR_RAISE(auto spec, base().PartitionSpecById(spec_id)); + std::vector delete_entries; + delete_entries.reserve(delete_files.size()); + for (const auto& pending_file : delete_files) { + delete_entries.push_back(DeleteManifestEntry{ + .file = pending_file.file, + .data_sequence_number = pending_file.data_sequence_number, + }); + } ICEBERG_ASSIGN_OR_RAISE(auto written, - WriteDeleteManifests(std::span(delete_files), spec)); + WriteDeleteManifests(delete_entries, spec)); for (const auto& m : written) { all_written_manifests_.insert(m.manifest_path); } @@ -371,8 +472,8 @@ Result> MergingSnapshotUpdate::Apply( const TableMetadata& metadata_to_update, const std::shared_ptr& snapshot) { // Re-validate buffered delete files against the current format version. A format // upgrade between staging and commit could make previously-valid files invalid. - for (const auto& file : new_delete_files_) { - ICEBERG_RETURN_UNEXPECTED(ValidateNewDeleteFile(*file)); + for (const auto& pending_file : new_delete_files_) { + ICEBERG_RETURN_UNEXPECTED(ValidateNewDeleteFile(*pending_file.file)); } // Rebuild summary from stable sub-builders so that commit retries don't double-count. @@ -381,12 +482,14 @@ Result> MergingSnapshotUpdate::Apply( summary_builder().Merge(added_delete_files_summary_); summary_builder().Merge(appended_manifests_summary_); - auto tracked_factory = MakeTrackedWriterFactory(); + ICEBERG_ASSIGN_OR_RAISE(auto target_schema, + SnapshotUtil::SchemaFor(metadata_to_update, target_branch())); + auto tracked_factory = MakeTrackedWriterFactory(target_schema); // Step 1: Filter data manifests. ICEBERG_ASSIGN_OR_RAISE(auto filtered_data, data_filter_manager_.FilterManifests( - metadata_to_update, snapshot, tracked_factory)); + target_schema, metadata_to_update, snapshot, tracked_factory)); // Track deleted data files in the summary builder. for (const auto& file : data_filter_manager_.FilesToBeDeleted()) { @@ -419,7 +522,7 @@ Result> MergingSnapshotUpdate::Apply( // Step 3: Filter delete manifests. ICEBERG_ASSIGN_OR_RAISE(auto filtered_deletes, delete_filter_manager_.FilterManifests( - metadata_to_update, snapshot, tracked_factory)); + target_schema, metadata_to_update, snapshot, tracked_factory)); // Track deleted delete files in the summary builder. for (const auto& file : delete_filter_manager_.FilesToBeDeleted()) { @@ -552,65 +655,19 @@ Status MergingSnapshotUpdate::ValidateAddedDataFiles( return {}; } - ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); - ICEBERG_ASSIGN_OR_RAISE(auto ancestors, - SnapshotUtil::AncestorsBetween(metadata, parent->snapshot_id, - starting_snapshot_id)); - - // Build the full set of matching snapshot IDs first, then scan their manifests. - // The full set must be known before filtering manifests, since a manifest may have - // been written by a different snapshot in the ancestry range. - std::unordered_set matching_snapshot_ids; - for (const auto& snap : ancestors) { - auto op = snap->Operation(); - if (op == DataOperation::kAppend || op == DataOperation::kOverwrite) { - matching_snapshot_ids.insert(snap->snapshot_id); - } - } - - std::unique_ptr evaluator; - if (filter != nullptr) { - ICEBERG_ASSIGN_OR_RAISE( - evaluator, InclusiveMetricsEvaluator::Make(filter, *schema, case_sensitive)); - } - - for (const auto& snapshot : ancestors) { - if (!matching_snapshot_ids.contains(snapshot->snapshot_id)) { - continue; - } - auto cached = SnapshotCache(snapshot.get()); - ICEBERG_ASSIGN_OR_RAISE(auto data_manifests, cached.DataManifests(io)); - - for (const auto& manifest : data_manifests) { - if (!matching_snapshot_ids.contains(manifest.added_snapshot_id)) { - continue; - } - ICEBERG_ASSIGN_OR_RAISE(auto spec, - metadata.PartitionSpecById(manifest.partition_spec_id)); - ICEBERG_ASSIGN_OR_RAISE(auto reader, - ManifestReader::Make(manifest, io, schema, spec)); - ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->Entries()); - - for (const auto& entry : entries) { - if (entry.status != ManifestStatus::kAdded) { - continue; - } - if (entry.data_file == nullptr) { - continue; - } - if (evaluator != nullptr) { - ICEBERG_ASSIGN_OR_RAISE(bool matches, evaluator->Evaluate(*entry.data_file)); - if (!matches) { - continue; - } - } - return InvalidArgument( - "Found conflicting files that can contain rows matching {}:" - " {} in snapshot {}", - filter != nullptr ? filter->ToString() : "any expression", - entry.data_file->file_path, snapshot->snapshot_id); - } - } + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kAppend, DataOperation::kOverwrite}, + ManifestContent::kData, io)); + ICEBERG_ASSIGN_OR_RAISE(auto conflict_path, + FindMatchingDataFile(metadata, history.manifests, + ManifestStatus::kAdded, filter, nullptr, io, + case_sensitive)); + if (conflict_path.has_value()) { + return InvalidArgument("Found conflicting files that can contain rows matching {}: {}", + filter != nullptr ? filter->ToString() : "any expression", + conflict_path.value()); } return {}; } @@ -626,8 +683,8 @@ Status MergingSnapshotUpdate::ValidateDataFilesExist( ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); ICEBERG_ASSIGN_OR_RAISE(auto ancestors, - SnapshotUtil::AncestorsBetween(metadata, parent->snapshot_id, - starting_snapshot_id)); + ValidationAncestorsBetween(metadata, parent->snapshot_id, + starting_snapshot_id)); // Build the full set of matching snapshot IDs first, then scan their manifests. // The full set must be known before filtering manifests, since a manifest may have @@ -742,51 +799,177 @@ Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( } Status MergingSnapshotUpdate::ValidateAddedDataFiles( - const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, - const PartitionSet& /*partition_set*/, const std::shared_ptr& /*parent*/, - std::shared_ptr /*io*/) { - return NotImplemented( - "ValidateAddedDataFiles with PartitionSet is not yet implemented"); + const TableMetadata& metadata, int64_t starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + if (parent == nullptr) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kAppend, DataOperation::kOverwrite}, + ManifestContent::kData, io)); + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_path, + FindMatchingDataFile(metadata, history.manifests, ManifestStatus::kAdded, nullptr, + &partition_set, io, /*case_sensitive=*/true)); + if (conflict_path.has_value()) { + return InvalidArgument( + "Found conflicting files that can contain rows in validated partitions: {}", + conflict_path.value()); + } + return {}; } Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( - const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, - std::shared_ptr /*data_filter*/, const DataFileSet& /*replaced_files*/, - const std::shared_ptr& /*parent*/, std::shared_ptr /*io*/) { - return NotImplemented( - "ValidateNoNewDeletesForDataFiles with data filter is not yet implemented"); + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, const DataFileSet& replaced_files, + const std::shared_ptr& parent, std::shared_ptr io) { + if (parent == nullptr || replaced_files.empty() || metadata.format_version < 2) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto deletes, + AddedDeleteFiles(metadata, starting_snapshot_id, nullptr, nullptr, parent, io)); + if (deletes->empty()) { + return {}; + } + + int64_t starting_seq = TableMetadata::kInitialSequenceNumber; + if (auto snap_result = metadata.SnapshotById(starting_snapshot_id); + snap_result.has_value()) { + starting_seq = snap_result.value()->sequence_number; + } + + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + std::unique_ptr evaluator; + if (data_filter != nullptr) { + ICEBERG_ASSIGN_OR_RAISE( + evaluator, InclusiveMetricsEvaluator::Make(data_filter, *schema, + /*case_sensitive=*/true)); + } + + for (const auto& data_file : replaced_files) { + ICEBERG_ASSIGN_OR_RAISE(auto delete_files, + deletes->ForDataFile(starting_seq, *data_file)); + for (const auto& delete_file : delete_files) { + if (evaluator != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(bool matches, evaluator->Evaluate(*delete_file)); + if (!matches) { + continue; + } + } + return InvalidArgument("Cannot commit, found new delete for replaced data file: {}", + data_file->file_path); + } + } + return {}; } Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( - const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, - std::shared_ptr /*data_filter*/, - const std::shared_ptr& /*parent*/, std::shared_ptr /*io*/) { - return NotImplemented( - "ValidateNoNewDeleteFiles with Expression is not yet implemented"); + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, std::shared_ptr io) { + ICEBERG_ASSIGN_OR_RAISE( + auto deletes, + AddedDeleteFiles(metadata, starting_snapshot_id, nullptr, nullptr, parent, io)); + auto referenced_delete_files = deletes->ReferencedDeleteFiles(); + + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + std::unique_ptr evaluator; + if (data_filter != nullptr) { + ICEBERG_ASSIGN_OR_RAISE( + evaluator, InclusiveMetricsEvaluator::Make(data_filter, *schema, + /*case_sensitive=*/true)); + } + + for (const auto& delete_file : referenced_delete_files) { + if (evaluator != nullptr) { + ICEBERG_ASSIGN_OR_RAISE(bool matches, evaluator->Evaluate(*delete_file)); + if (!matches) { + continue; + } + } + return InvalidArgument("Found new conflicting delete files: {}", + delete_file->file_path); + } + return {}; } Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( - const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, - const PartitionSet& /*partition_set*/, const std::shared_ptr& /*parent*/, - std::shared_ptr /*io*/) { - return NotImplemented( - "ValidateNoNewDeleteFiles with PartitionSet is not yet implemented"); + const TableMetadata& metadata, int64_t starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + ICEBERG_ASSIGN_OR_RAISE( + auto deletes, + AddedDeleteFiles(metadata, starting_snapshot_id, nullptr, nullptr, parent, io)); + auto referenced_delete_files = deletes->ReferencedDeleteFiles(); + for (const auto& delete_file : referenced_delete_files) { + if (!delete_file->partition_spec_id.has_value() || + !partition_set.contains(delete_file->partition_spec_id.value(), + delete_file->partition)) { + continue; + } + return InvalidArgument("Found new conflicting delete files in validated partitions: {}", + delete_file->file_path); + } + return {}; } Status MergingSnapshotUpdate::ValidateDeletedDataFiles( - const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, - std::shared_ptr /*data_filter*/, - const std::shared_ptr& /*parent*/, std::shared_ptr /*io*/) { - return NotImplemented( - "ValidateDeletedDataFiles with Expression is not yet implemented"); + const TableMetadata& metadata, int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, std::shared_ptr io) { + if (parent == nullptr) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kOverwrite, DataOperation::kReplace, + DataOperation::kDelete}, + ManifestContent::kData, io)); + ICEBERG_ASSIGN_OR_RAISE(auto conflict_path, + FindMatchingDataFile(metadata, history.manifests, + ManifestStatus::kDeleted, data_filter, + nullptr, io, /*case_sensitive=*/true)); + if (conflict_path.has_value()) { + return InvalidArgument( + "Found conflicting deleted files that can contain rows matching {}: {}", + data_filter != nullptr ? data_filter->ToString() : "any expression", + conflict_path.value()); + } + return {}; } Status MergingSnapshotUpdate::ValidateDeletedDataFiles( - const TableMetadata& /*metadata*/, int64_t /*starting_snapshot_id*/, - const PartitionSet& /*partition_set*/, const std::shared_ptr& /*parent*/, - std::shared_ptr /*io*/) { - return NotImplemented( - "ValidateDeletedDataFiles with PartitionSet is not yet implemented"); + const TableMetadata& metadata, int64_t starting_snapshot_id, + const PartitionSet& partition_set, const std::shared_ptr& parent, + std::shared_ptr io) { + if (parent == nullptr) { + return {}; + } + + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kOverwrite, DataOperation::kReplace, + DataOperation::kDelete}, + ManifestContent::kData, io)); + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_path, + FindMatchingDataFile(metadata, history.manifests, ManifestStatus::kDeleted, nullptr, + &partition_set, io, /*case_sensitive=*/true)); + if (conflict_path.has_value()) { + return InvalidArgument( + "Found conflicting deleted files in validated partitions: {}", + conflict_path.value()); + } + return {}; } Result> MergingSnapshotUpdate::AddedDeleteFiles( @@ -806,32 +989,11 @@ Result> MergingSnapshotUpdate::AddedDeleteFiles return builder.Build(); } - ICEBERG_ASSIGN_OR_RAISE(auto ancestors, - SnapshotUtil::AncestorsBetween(metadata, parent->snapshot_id, - starting_snapshot_id)); - - // Collect delete manifests from OVERWRITE and DELETE snapshots only. - std::unordered_set matching_snapshot_ids; - for (const auto& snap : ancestors) { - auto op = snap->Operation(); - if (op == DataOperation::kOverwrite || op == DataOperation::kDelete) { - matching_snapshot_ids.insert(snap->snapshot_id); - } - } - - std::vector delete_manifests; - for (const auto& snapshot : ancestors) { - if (!matching_snapshot_ids.contains(snapshot->snapshot_id)) { - continue; - } - auto cached = SnapshotCache(snapshot.get()); - ICEBERG_ASSIGN_OR_RAISE(auto manifests, cached.DeleteManifests(io)); - for (const auto& m : manifests) { - if (matching_snapshot_ids.contains(m.added_snapshot_id)) { - delete_manifests.push_back(m); - } - } - } + ICEBERG_ASSIGN_OR_RAISE( + auto history, + ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kOverwrite, DataOperation::kDelete}, + ManifestContent::kDeletes, io)); // Compute the starting sequence number from the starting snapshot. int64_t starting_seq = TableMetadata::kInitialSequenceNumber; @@ -847,7 +1009,7 @@ Result> MergingSnapshotUpdate::AddedDeleteFiles ICEBERG_ASSIGN_OR_RAISE(auto builder, DeleteFileIndex::BuilderFor(io, schema, std::move(specs_by_id), - std::move(delete_manifests))); + std::move(history.manifests))); builder.AfterSequenceNumber(starting_seq); builder.CaseSensitive(case_sensitive); if (data_filter != nullptr) { diff --git a/src/iceberg/update/merging_snapshot_update.h b/src/iceberg/update/merging_snapshot_update.h index a4030bb4d..54e259c23 100644 --- a/src/iceberg/update/merging_snapshot_update.h +++ b/src/iceberg/update/merging_snapshot_update.h @@ -92,15 +92,13 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Stage a delete file with an explicit data sequence number. /// - /// \note Not yet implemented; returns NotImplemented error. Status AddDeleteFile(std::shared_ptr file, int64_t data_sequence_number); /// \brief Add all files in a pre-existing data manifest to the new snapshot. /// - /// The manifest must contain only DATA content and only ADDED entries (no - /// existing or deleted files). If snapshot ID inheritance is enabled and the - /// manifest has no snapshot ID assigned, it is used directly; otherwise it is - /// copied with the current snapshot ID. + /// The manifest must contain DATA content. If snapshot ID inheritance is + /// enabled and the manifest has no snapshot ID assigned, it is used directly; + /// otherwise it is copied with the current snapshot ID. Status AddManifest(ManifestFile manifest); /// \brief Register a data file (by object) to be deleted from the table. @@ -177,7 +175,6 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] /// added a data file in any partition of the given partition set. /// - /// \note Not yet implemented; returns NotImplemented error. static Status ValidateAddedDataFiles(const TableMetadata& metadata, int64_t starting_snapshot_id, const PartitionSet& partition_set, @@ -228,7 +225,6 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] /// added a delete file matching the data filter that covers a file in replaced_files. /// - /// \note Not yet implemented; returns NotImplemented error. static Status ValidateNoNewDeletesForDataFiles(const TableMetadata& metadata, int64_t starting_snapshot_id, std::shared_ptr data_filter, @@ -239,7 +235,6 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] /// added a delete file matching the given row filter. /// - /// \note Not yet implemented; returns NotImplemented error. static Status ValidateNoNewDeleteFiles(const TableMetadata& metadata, int64_t starting_snapshot_id, std::shared_ptr data_filter, @@ -249,7 +244,6 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] /// added a delete file matching any partition in the given partition set. /// - /// \note Not yet implemented; returns NotImplemented error. static Status ValidateNoNewDeleteFiles(const TableMetadata& metadata, int64_t starting_snapshot_id, const PartitionSet& partition_set, @@ -259,7 +253,6 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] /// deleted a data file matching the given row filter. /// - /// \note Not yet implemented; returns NotImplemented error. static Status ValidateDeletedDataFiles(const TableMetadata& metadata, int64_t starting_snapshot_id, std::shared_ptr data_filter, @@ -269,7 +262,6 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Return an error if any snapshot in [starting_snapshot_id+1, parent] /// deleted a data file in any partition of the given partition set. /// - /// \note Not yet implemented; returns NotImplemented error. static Status ValidateDeletedDataFiles(const TableMetadata& metadata, int64_t starting_snapshot_id, const PartitionSet& partition_set, @@ -295,14 +287,22 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { std::shared_ptr io); private: + struct PendingDeleteFile { + std::shared_ptr file; + std::optional data_sequence_number; + }; + /// \brief Create a ManifestWriterFactory that records every path it creates in /// all_written_manifests_. - ManifestWriterFactory MakeTrackedWriterFactory(); + ManifestWriterFactory MakeTrackedWriterFactory(const std::shared_ptr& schema); /// \brief Copy a manifest with the current snapshot ID, for use when snapshot /// ID inheritance is not possible. Result CopyManifest(const ManifestFile& manifest); + Status AddDeleteFile(std::shared_ptr file, + std::optional data_sequence_number); + /// \brief Write new data manifests for staged data files; caches the result. Result> WriteNewDataManifests(); @@ -326,7 +326,7 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { ManifestMergeManager delete_merge_manager_; std::unordered_map new_data_files_by_spec_; - std::vector> new_delete_files_; + std::vector new_delete_files_; std::optional new_data_files_data_seq_number_; // Manifests passed via AddManifest(): inherit path (no copy needed) and diff --git a/src/iceberg/update/snapshot_update.cc b/src/iceberg/update/snapshot_update.cc index fc5359e42..bff12c912 100644 --- a/src/iceberg/update/snapshot_update.cc +++ b/src/iceberg/update/snapshot_update.cc @@ -194,6 +194,19 @@ Result> SnapshotUpdate::WriteDataManifests( Result> SnapshotUpdate::WriteDeleteManifests( std::span> files, const std::shared_ptr& spec) { + std::vector delete_entries; + delete_entries.reserve(files.size()); + for (const auto& file : files) { + delete_entries.push_back( + DeleteManifestEntry{.file = file, .data_sequence_number = std::nullopt}); + } + return WriteDeleteManifests(delete_entries, spec); +} + +// TODO(xxx): write manifests in parallel +Result> SnapshotUpdate::WriteDeleteManifests( + std::span files, + const std::shared_ptr& spec) { if (files.empty()) { return std::vector{}; } @@ -208,10 +221,9 @@ Result> SnapshotUpdate::WriteDeleteManifests( }, target_manifest_size_bytes_); - for (const auto& file : files) { - // FIXME: Java impl wrap it with `PendingDeleteFile` and deals with - // file->data_sequence_number - ICEBERG_RETURN_UNEXPECTED(rolling_writer.WriteAddedEntry(file)); + for (const auto& entry : files) { + ICEBERG_RETURN_UNEXPECTED( + rolling_writer.WriteAddedEntry(entry.file, entry.data_sequence_number)); } ICEBERG_RETURN_UNEXPECTED(rolling_writer.Close()); return rolling_writer.ToManifestFiles(); diff --git a/src/iceberg/update/snapshot_update.h b/src/iceberg/update/snapshot_update.h index f48e5f44d..42df70d61 100644 --- a/src/iceberg/update/snapshot_update.h +++ b/src/iceberg/update/snapshot_update.h @@ -122,6 +122,11 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { Status Finalize(Result commit_result) override; protected: + struct DeleteManifestEntry { + std::shared_ptr file; + std::optional data_sequence_number; + }; + explicit SnapshotUpdate(std::shared_ptr ctx); /// \brief Write data manifests for the given data files @@ -144,6 +149,10 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { std::span> files, const std::shared_ptr& spec); + Result> WriteDeleteManifests( + std::span files, + const std::shared_ptr& spec); + const std::string& target_branch() const { return target_branch_; } bool can_inherit_snapshot_id() const { return can_inherit_snapshot_id_; } const std::string& commit_uuid() const { return commit_uuid_; } diff --git a/src/iceberg/util/snapshot_util.cc b/src/iceberg/util/snapshot_util.cc index 49019408b..1bbd6ae8c 100644 --- a/src/iceberg/util/snapshot_util.cc +++ b/src/iceberg/util/snapshot_util.cc @@ -375,7 +375,7 @@ Result> SnapshotUtil::SchemaFor(const Table& table, const auto& metadata = table.metadata(); auto it = metadata->refs.find(ref); - if (it == metadata->refs.cend() || it->second->type() == SnapshotRefType::kBranch) { + if (it == metadata->refs.cend()) { return table.schema(); } @@ -389,7 +389,7 @@ Result> SnapshotUtil::SchemaFor(const TableMetadata& met } auto it = metadata.refs.find(ref); - if (it == metadata.refs.end() || it->second->type() == SnapshotRefType::kBranch) { + if (it == metadata.refs.end()) { return metadata.Schema(); } diff --git a/src/iceberg/util/snapshot_util_internal.h b/src/iceberg/util/snapshot_util_internal.h index 8a3158185..66a99a3b4 100644 --- a/src/iceberg/util/snapshot_util_internal.h +++ b/src/iceberg/util/snapshot_util_internal.h @@ -306,9 +306,9 @@ class ICEBERG_EXPORT SnapshotUtil { /// \brief Return the schema of the snapshot at a given ref. /// - /// If the ref does not exist or the ref is a branch, the table schema is returned - /// because it will be the schema when the new branch is created. If the ref is a tag, - /// then the snapshot schema is returned. + /// If the ref does not exist, the current table schema is returned. If the ref exists + /// and points to a snapshot (branch or tag), the schema recorded for that snapshot is + /// returned. /// /// \param table The table /// \param ref Ref name of the table (empty string means main branch) @@ -318,9 +318,9 @@ class ICEBERG_EXPORT SnapshotUtil { /// \brief Return the schema of the snapshot at a given ref. /// - /// If the ref does not exist or the ref is a branch, the table schema is returned - /// because it will be the schema when the new branch is created. If the ref is a tag, - /// then the snapshot schema is returned. + /// If the ref does not exist, the current table schema is returned. If the ref exists + /// and points to a snapshot (branch or tag), the schema recorded for that snapshot is + /// returned. /// /// \param metadata The table metadata /// \param ref Ref name of the table (empty string means main branch) From fab0b7b534c0386c9a9171a592728d80853b598f Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Wed, 27 May 2026 15:04:39 +0800 Subject: [PATCH 3/4] Apply clang-format fixes Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../manifest/manifest_merge_manager.cc | 4 +- .../test/manifest_filter_manager_test.cc | 8 +- .../test/manifest_merge_manager_test.cc | 12 +- .../test/merging_snapshot_update_test.cc | 122 ++++++------ src/iceberg/update/merging_snapshot_update.cc | 182 +++++++++--------- 5 files changed, 165 insertions(+), 163 deletions(-) diff --git a/src/iceberg/manifest/manifest_merge_manager.cc b/src/iceberg/manifest/manifest_merge_manager.cc index e29a96846..b924450cf 100644 --- a/src/iceberg/manifest/manifest_merge_manager.cc +++ b/src/iceberg/manifest/manifest_merge_manager.cc @@ -143,8 +143,8 @@ Result> ManifestMergeManager::MergeGroup( ICEBERG_ASSIGN_OR_RAISE( auto merged, FlushBin(bin, snapshot_id, metadata, file_io, writer_factory)); if (bin.size() > 1) { - replaced_manifests_count_ += static_cast(std::ranges::count_if( - bin, [snapshot_id](const ManifestFile* manifest) { + replaced_manifests_count_ += static_cast( + std::ranges::count_if(bin, [snapshot_id](const ManifestFile* manifest) { return manifest->added_snapshot_id != snapshot_id; })); } diff --git a/src/iceberg/test/manifest_filter_manager_test.cc b/src/iceberg/test/manifest_filter_manager_test.cc index 7dc4ee2d9..a49445ba3 100644 --- a/src/iceberg/test/manifest_filter_manager_test.cc +++ b/src/iceberg/test/manifest_filter_manager_test.cc @@ -475,10 +475,10 @@ TEST_F(ManifestFilterManagerTest, DropDeleteFilesOlderThanDuringDeleteManifestRe auto manifest_path = std::format("{}/metadata/del-manifest-{}.avro", table_location_, manifest_counter_++); - ICEBERG_UNWRAP_OR_FAIL(auto del_manifest, - WriteDeleteManifest( - {{old_file, 2L}, {targeted_file, 10L}, {keep_file, 10L}}, - file_io_, *metadata, manifest_path)); + ICEBERG_UNWRAP_OR_FAIL( + auto del_manifest, + WriteDeleteManifest({{old_file, 2L}, {targeted_file, 10L}, {keep_file, 10L}}, + file_io_, *metadata, manifest_path)); ManifestFilterManager mgr(ManifestContent::kDeletes, file_io_); mgr.DeleteFile(targeted_file->file_path); diff --git a/src/iceberg/test/manifest_merge_manager_test.cc b/src/iceberg/test/manifest_merge_manager_test.cc index b5dd60f9e..d06ff0842 100644 --- a/src/iceberg/test/manifest_merge_manager_test.cc +++ b/src/iceberg/test/manifest_merge_manager_test.cc @@ -199,9 +199,9 @@ TEST_F(ManifestMergeManagerTest, ReplacedManifestCountTracksPreviousSnapshotInpu m1.added_snapshot_id = kSnapshotId - 2; ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL(auto result, - mgr.MergeManifests({m0, m1}, {}, kSnapshotId, *metadata_, file_io_, - MakeWriterFactory())); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr.MergeManifests({m0, m1}, {}, kSnapshotId, *metadata_, file_io_, + MakeWriterFactory())); EXPECT_EQ(result.size(), 1U); EXPECT_EQ(mgr.ReplacedManifestsCount(), 2); @@ -212,9 +212,9 @@ TEST_F(ManifestMergeManagerTest, ReplacedManifestCountIgnoresCurrentSnapshotInpu ICEBERG_UNWRAP_OR_FAIL(auto m1, WriteManifest(kSpecId0, 1, /*size=*/100)); ManifestMergeManager mgr(/*target=*/1024, /*min_count=*/2, /*enabled=*/true); - ICEBERG_UNWRAP_OR_FAIL(auto result, - mgr.MergeManifests({}, {m0, m1}, kSnapshotId, *metadata_, file_io_, - MakeWriterFactory())); + ICEBERG_UNWRAP_OR_FAIL( + auto result, mgr.MergeManifests({}, {m0, m1}, kSnapshotId, *metadata_, file_io_, + MakeWriterFactory())); EXPECT_EQ(result.size(), 1U); EXPECT_EQ(mgr.ReplacedManifestsCount(), 0); diff --git a/src/iceberg/test/merging_snapshot_update_test.cc b/src/iceberg/test/merging_snapshot_update_test.cc index d436b7c53..a8317ddc3 100644 --- a/src/iceberg/test/merging_snapshot_update_test.cc +++ b/src/iceberg/test/merging_snapshot_update_test.cc @@ -107,31 +107,35 @@ class TestMergeAppend : public MergingSnapshotUpdate { metadata, starting_snapshot_id, std::move(data_filter), replaced_files, parent, std::move(io)); } - static Status ValidateNoNewDeleteFilesForTest( - const TableMetadata& metadata, int64_t starting_snapshot_id, - std::shared_ptr data_filter, const std::shared_ptr& parent, - std::shared_ptr io) { + static Status ValidateNoNewDeleteFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io) { return MergingSnapshotUpdate::ValidateNoNewDeleteFiles( metadata, starting_snapshot_id, std::move(data_filter), parent, std::move(io)); } - static Status ValidateNoNewDeleteFilesForTest( - const TableMetadata& metadata, int64_t starting_snapshot_id, - const PartitionSet& partition_set, const std::shared_ptr& parent, - std::shared_ptr io) { + static Status ValidateNoNewDeleteFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io) { return MergingSnapshotUpdate::ValidateNoNewDeleteFiles( metadata, starting_snapshot_id, partition_set, parent, std::move(io)); } - static Status ValidateDeletedDataFilesForTest( - const TableMetadata& metadata, int64_t starting_snapshot_id, - std::shared_ptr data_filter, const std::shared_ptr& parent, - std::shared_ptr io) { + static Status ValidateDeletedDataFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + std::shared_ptr data_filter, + const std::shared_ptr& parent, + std::shared_ptr io) { return MergingSnapshotUpdate::ValidateDeletedDataFiles( metadata, starting_snapshot_id, std::move(data_filter), parent, std::move(io)); } - static Status ValidateDeletedDataFilesForTest( - const TableMetadata& metadata, int64_t starting_snapshot_id, - const PartitionSet& partition_set, const std::shared_ptr& parent, - std::shared_ptr io) { + static Status ValidateDeletedDataFilesForTest(const TableMetadata& metadata, + int64_t starting_snapshot_id, + const PartitionSet& partition_set, + const std::shared_ptr& parent, + std::shared_ptr io) { return MergingSnapshotUpdate::ValidateDeletedDataFiles( metadata, starting_snapshot_id, partition_set, parent, std::move(io)); } @@ -147,30 +151,30 @@ class TestMergeAppend : public MergingSnapshotUpdate { class TestOverwriteUpdate : public MergingSnapshotUpdate { public: - static Result> Make( - std::string table_name, std::shared_ptr
table) { - ICEBERG_ASSIGN_OR_RAISE( - auto ctx, TransactionContext::Make(std::move(table), TransactionKind::kUpdate)); - return std::unique_ptr( - new TestOverwriteUpdate(std::move(table_name), std::move(ctx))); - } - - std::string operation() override { return DataOperation::kOverwrite; } - int64_t GeneratedSnapshotId() { return SnapshotId(); } - - Status AddDelete(std::shared_ptr file) { - return AddDeleteFile(std::move(file)); - } - Status AddDelete(std::shared_ptr file, int64_t data_sequence_number) { - return AddDeleteFile(std::move(file), data_sequence_number); - } - Status RemoveDataFile(std::shared_ptr file) { - return DeleteDataFile(std::move(file)); - } + static Result> Make(std::string table_name, + std::shared_ptr
table) { + ICEBERG_ASSIGN_OR_RAISE( + auto ctx, TransactionContext::Make(std::move(table), TransactionKind::kUpdate)); + return std::unique_ptr( + new TestOverwriteUpdate(std::move(table_name), std::move(ctx))); + } + + std::string operation() override { return DataOperation::kOverwrite; } + int64_t GeneratedSnapshotId() { return SnapshotId(); } + + Status AddDelete(std::shared_ptr file) { + return AddDeleteFile(std::move(file)); + } + Status AddDelete(std::shared_ptr file, int64_t data_sequence_number) { + return AddDeleteFile(std::move(file), data_sequence_number); + } + Status RemoveDataFile(std::shared_ptr file) { + return DeleteDataFile(std::move(file)); + } private: - TestOverwriteUpdate(std::string table_name, std::shared_ptr ctx) - : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} + TestOverwriteUpdate(std::string table_name, std::shared_ptr ctx) + : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} }; class MergingSnapshotUpdateTest : public MinimalUpdateTestBase { @@ -268,8 +272,8 @@ class MergingSnapshotUpdateTest : public MinimalUpdateTestBase { std::string operation, int64_t snapshot_id, std::optional parent_snapshot_id, int64_t sequence_number, const std::vector& manifests) { - auto manifest_list_path = - table_location_ + "/metadata/manifest-list-" + std::to_string(snapshot_id) + ".avro"; + auto manifest_list_path = table_location_ + "/metadata/manifest-list-" + + std::to_string(snapshot_id) + ".avro"; ICEBERG_ASSIGN_OR_RAISE( auto writer, ManifestListWriter::MakeWriter(table_->metadata()->format_version, snapshot_id, @@ -452,9 +456,9 @@ TEST_F(MergingSnapshotUpdateTest, AddDeleteFileWithExplicitSequenceWritesSequenc return manifest.content == ManifestContent::kDeletes; }); ASSERT_NE(delete_manifest_it, manifests.end()); - ICEBERG_UNWRAP_OR_FAIL( - auto entries, ReadAllEntries(std::vector{*delete_manifest_it}, - *table_->metadata())); + ICEBERG_UNWRAP_OR_FAIL(auto entries, + ReadAllEntries(std::vector{*delete_manifest_it}, + *table_->metadata())); ASSERT_EQ(entries.size(), 1U); ASSERT_TRUE(entries[0].sequence_number.has_value()); EXPECT_EQ(entries[0].sequence_number.value(), 17); @@ -888,7 +892,8 @@ TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesFailsForTruncatedHistory metadata->current_schema_id = 0; metadata->schemas.push_back(schema_); - auto make_snapshot = [](int64_t snapshot_id, std::optional parent_snapshot_id) { + auto make_snapshot = [](int64_t snapshot_id, + std::optional parent_snapshot_id) { return std::make_shared(Snapshot{ .snapshot_id = snapshot_id, .parent_snapshot_id = parent_snapshot_id, @@ -928,7 +933,8 @@ TEST_F(MergingSnapshotUpdateTest, ValidateAddedDataFilesWithPartitionSetDetectsC IsError(ErrorKind::kInvalidArgument)); } -TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeletesForDataFilesWithFilterDetectsConflict) { +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeletesForDataFilesWithFilterDetectsConflict) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); @@ -951,8 +957,8 @@ TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeletesForDataFilesWithFilterDete DataFileSet replaced_files; replaced_files.insert(file_a_); EXPECT_THAT(TestMergeAppend::ValidateNoNewDeletesForDataFilesForTest( - *metadata, first_snapshot->snapshot_id, - Expressions::AlwaysTrue(), replaced_files, second_snapshot, file_io_), + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + replaced_files, second_snapshot, file_io_), IsError(ErrorKind::kInvalidArgument)); } @@ -977,12 +983,13 @@ TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeleteFilesWithExpressionDetectsC metadata->last_sequence_number = second_snapshot->sequence_number; EXPECT_THAT(TestMergeAppend::ValidateNoNewDeleteFilesForTest( - *metadata, first_snapshot->snapshot_id, - Expressions::AlwaysTrue(), second_snapshot, file_io_), + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + second_snapshot, file_io_), IsError(ErrorKind::kInvalidArgument)); } -TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeleteFilesWithPartitionSetDetectsConflict) { +TEST_F(MergingSnapshotUpdateTest, + ValidateNoNewDeleteFilesWithPartitionSetDetectsConflict) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); @@ -1005,8 +1012,8 @@ TEST_F(MergingSnapshotUpdateTest, ValidateNoNewDeleteFilesWithPartitionSetDetect PartitionSet partition_set; ASSERT_TRUE(partition_set.add(spec_->spec_id(), del_file->partition)); EXPECT_THAT(TestMergeAppend::ValidateNoNewDeleteFilesForTest( - *metadata, first_snapshot->snapshot_id, partition_set, - second_snapshot, file_io_), + *metadata, first_snapshot->snapshot_id, partition_set, second_snapshot, + file_io_), IsError(ErrorKind::kInvalidArgument)); } @@ -1030,12 +1037,13 @@ TEST_F(MergingSnapshotUpdateTest, ValidateDeletedDataFilesWithExpressionDetectsC metadata->last_sequence_number = second_snapshot->sequence_number; EXPECT_THAT(TestMergeAppend::ValidateDeletedDataFilesForTest( - *metadata, first_snapshot->snapshot_id, - Expressions::AlwaysTrue(), second_snapshot, file_io_), + *metadata, first_snapshot->snapshot_id, Expressions::AlwaysTrue(), + second_snapshot, file_io_), IsError(ErrorKind::kInvalidArgument)); } -TEST_F(MergingSnapshotUpdateTest, ValidateDeletedDataFilesWithPartitionSetDetectsConflict) { +TEST_F(MergingSnapshotUpdateTest, + ValidateDeletedDataFilesWithPartitionSetDetectsConflict) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); @@ -1057,8 +1065,8 @@ TEST_F(MergingSnapshotUpdateTest, ValidateDeletedDataFilesWithPartitionSetDetect PartitionSet partition_set; ASSERT_TRUE(partition_set.add(spec_->spec_id(), file_a_->partition)); EXPECT_THAT(TestMergeAppend::ValidateDeletedDataFilesForTest( - *metadata, first_snapshot->snapshot_id, partition_set, - second_snapshot, file_io_), + *metadata, first_snapshot->snapshot_id, partition_set, second_snapshot, + file_io_), IsError(ErrorKind::kInvalidArgument)); } diff --git a/src/iceberg/update/merging_snapshot_update.cc b/src/iceberg/update/merging_snapshot_update.cc index 738dc2dcd..93b169682 100644 --- a/src/iceberg/update/merging_snapshot_update.cc +++ b/src/iceberg/update/merging_snapshot_update.cc @@ -50,8 +50,7 @@ namespace { bool MatchesOperation(std::optional operation, std::initializer_list expected) { return operation.has_value() && - std::find(expected.begin(), expected.end(), operation.value()) != - expected.end(); + std::find(expected.begin(), expected.end(), operation.value()) != expected.end(); } struct ValidationHistoryResult { @@ -69,17 +68,19 @@ Result>> ValidationAncestorsBetween( return ancestors; } if (ancestors.empty()) { - return InvalidArgument("Cannot validate history: starting snapshot {} is not an ancestor " - "of snapshot {}", - starting_snapshot_id, latest_snapshot_id); + return InvalidArgument( + "Cannot validate history: starting snapshot {} is not an ancestor " + "of snapshot {}", + starting_snapshot_id, latest_snapshot_id); } const auto& oldest_checked = ancestors.back(); if (oldest_checked == nullptr || !oldest_checked->parent_snapshot_id.has_value() || oldest_checked->parent_snapshot_id.value() != starting_snapshot_id) { - return InvalidArgument("Cannot validate history: starting snapshot {} is not an ancestor " - "of snapshot {}", - starting_snapshot_id, latest_snapshot_id); + return InvalidArgument( + "Cannot validate history: starting snapshot {} is not an ancestor " + "of snapshot {}", + starting_snapshot_id, latest_snapshot_id); } return ancestors; } @@ -87,8 +88,8 @@ Result>> ValidationAncestorsBetween( Result ValidationHistory( const TableMetadata& metadata, int64_t latest_snapshot_id, int64_t starting_snapshot_id, - std::initializer_list matching_operations, - ManifestContent content, const std::shared_ptr& io) { + std::initializer_list matching_operations, ManifestContent content, + const std::shared_ptr& io) { ICEBERG_ASSIGN_OR_RAISE( auto ancestors, ValidationAncestorsBetween(metadata, latest_snapshot_id, starting_snapshot_id)); @@ -101,9 +102,9 @@ Result ValidationHistory( result.snapshot_ids.insert(snapshot->snapshot_id); auto cached = SnapshotCache(snapshot.get()); - ICEBERG_ASSIGN_OR_RAISE( - auto manifests, content == ManifestContent::kData ? cached.DataManifests(io) - : cached.DeleteManifests(io)); + ICEBERG_ASSIGN_OR_RAISE(auto manifests, content == ManifestContent::kData + ? cached.DataManifests(io) + : cached.DeleteManifests(io)); for (const auto& manifest : manifests) { if (manifest.added_snapshot_id == snapshot->snapshot_id) { result.manifests.push_back(manifest); @@ -127,7 +128,8 @@ Result> FindMatchingDataFile( for (const auto& manifest : manifests) { ICEBERG_ASSIGN_OR_RAISE(auto spec, metadata.PartitionSpecById(manifest.partition_spec_id)); - ICEBERG_ASSIGN_OR_RAISE(auto reader, ManifestReader::Make(manifest, io, schema, spec)); + ICEBERG_ASSIGN_OR_RAISE(auto reader, + ManifestReader::Make(manifest, io, schema, spec)); reader->CaseSensitive(case_sensitive); if (filter != nullptr) { reader->FilterRows(filter); @@ -247,9 +249,8 @@ Status MergingSnapshotUpdate::AddDeleteFile(std::shared_ptr file, base().PartitionSpecById(file->partition_spec_id.value())); ICEBERG_RETURN_UNEXPECTED(added_delete_files_summary_.AddedFile(*spec, *file)); has_new_delete_files_ = true; - new_delete_files_.push_back( - PendingDeleteFile{.file = std::move(file), - .data_sequence_number = std::move(data_sequence_number)}); + new_delete_files_.push_back(PendingDeleteFile{ + .file = std::move(file), .data_sequence_number = std::move(data_sequence_number)}); return {}; } @@ -381,15 +382,17 @@ bool MergingSnapshotUpdate::DeletesDeleteFiles() const { ManifestWriterFactory MergingSnapshotUpdate::MakeTrackedWriterFactory( const std::shared_ptr& schema) { - return [this, schema](int32_t spec_id, - ManifestContent content) -> Result> { - const TableMetadata& meta = base(); - ICEBERG_ASSIGN_OR_RAISE(auto spec, meta.PartitionSpecById(spec_id)); - std::string path = ManifestPath(); - all_written_manifests_.insert(path); - return ManifestWriter::MakeWriter(meta.format_version, SnapshotId(), std::move(path), - ctx_->table->io(), std::move(spec), schema, content); - }; + return + [this, schema](int32_t spec_id, + ManifestContent content) -> Result> { + const TableMetadata& meta = base(); + ICEBERG_ASSIGN_OR_RAISE(auto spec, meta.PartitionSpecById(spec_id)); + std::string path = ManifestPath(); + all_written_manifests_.insert(path); + return ManifestWriter::MakeWriter(meta.format_version, SnapshotId(), + std::move(path), ctx_->table->io(), + std::move(spec), schema, content); + }; } Result> MergingSnapshotUpdate::WriteNewDataManifests() { @@ -454,8 +457,7 @@ Result> MergingSnapshotUpdate::WriteNewDeleteManifests .data_sequence_number = pending_file.data_sequence_number, }); } - ICEBERG_ASSIGN_OR_RAISE(auto written, - WriteDeleteManifests(delete_entries, spec)); + ICEBERG_ASSIGN_OR_RAISE(auto written, WriteDeleteManifests(delete_entries, spec)); for (const auto& m : written) { all_written_manifests_.insert(m.manifest_path); } @@ -487,9 +489,9 @@ Result> MergingSnapshotUpdate::Apply( auto tracked_factory = MakeTrackedWriterFactory(target_schema); // Step 1: Filter data manifests. - ICEBERG_ASSIGN_OR_RAISE(auto filtered_data, - data_filter_manager_.FilterManifests( - target_schema, metadata_to_update, snapshot, tracked_factory)); + ICEBERG_ASSIGN_OR_RAISE(auto filtered_data, data_filter_manager_.FilterManifests( + target_schema, metadata_to_update, + snapshot, tracked_factory)); // Track deleted data files in the summary builder. for (const auto& file : data_filter_manager_.FilesToBeDeleted()) { @@ -520,9 +522,9 @@ Result> MergingSnapshotUpdate::Apply( data_filter_manager_.FilesToBeDeleted()); // Step 3: Filter delete manifests. - ICEBERG_ASSIGN_OR_RAISE(auto filtered_deletes, - delete_filter_manager_.FilterManifests( - target_schema, metadata_to_update, snapshot, tracked_factory)); + ICEBERG_ASSIGN_OR_RAISE(auto filtered_deletes, delete_filter_manager_.FilterManifests( + target_schema, metadata_to_update, + snapshot, tracked_factory)); // Track deleted delete files in the summary builder. for (const auto& file : delete_filter_manager_.FilesToBeDeleted()) { @@ -656,18 +658,17 @@ Status MergingSnapshotUpdate::ValidateAddedDataFiles( } ICEBERG_ASSIGN_OR_RAISE( - auto history, - ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, - {DataOperation::kAppend, DataOperation::kOverwrite}, - ManifestContent::kData, io)); - ICEBERG_ASSIGN_OR_RAISE(auto conflict_path, - FindMatchingDataFile(metadata, history.manifests, - ManifestStatus::kAdded, filter, nullptr, io, - case_sensitive)); + auto history, ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kAppend, DataOperation::kOverwrite}, + ManifestContent::kData, io)); + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_path, + FindMatchingDataFile(metadata, history.manifests, ManifestStatus::kAdded, filter, + nullptr, io, case_sensitive)); if (conflict_path.has_value()) { - return InvalidArgument("Found conflicting files that can contain rows matching {}: {}", - filter != nullptr ? filter->ToString() : "any expression", - conflict_path.value()); + return InvalidArgument( + "Found conflicting files that can contain rows matching {}: {}", + filter != nullptr ? filter->ToString() : "any expression", conflict_path.value()); } return {}; } @@ -682,9 +683,9 @@ Status MergingSnapshotUpdate::ValidateDataFilesExist( } ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); - ICEBERG_ASSIGN_OR_RAISE(auto ancestors, - ValidationAncestorsBetween(metadata, parent->snapshot_id, - starting_snapshot_id)); + ICEBERG_ASSIGN_OR_RAISE( + auto ancestors, + ValidationAncestorsBetween(metadata, parent->snapshot_id, starting_snapshot_id)); // Build the full set of matching snapshot IDs first, then scan their manifests. // The full set must be known before filtering manifests, since a manifest may have @@ -807,10 +808,9 @@ Status MergingSnapshotUpdate::ValidateAddedDataFiles( } ICEBERG_ASSIGN_OR_RAISE( - auto history, - ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, - {DataOperation::kAppend, DataOperation::kOverwrite}, - ManifestContent::kData, io)); + auto history, ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kAppend, DataOperation::kOverwrite}, + ManifestContent::kData, io)); ICEBERG_ASSIGN_OR_RAISE( auto conflict_path, FindMatchingDataFile(metadata, history.manifests, ManifestStatus::kAdded, nullptr, @@ -831,9 +831,8 @@ Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( return {}; } - ICEBERG_ASSIGN_OR_RAISE( - auto deletes, - AddedDeleteFiles(metadata, starting_snapshot_id, nullptr, nullptr, parent, io)); + ICEBERG_ASSIGN_OR_RAISE(auto deletes, AddedDeleteFiles(metadata, starting_snapshot_id, + nullptr, nullptr, parent, io)); if (deletes->empty()) { return {}; } @@ -847,9 +846,9 @@ Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); std::unique_ptr evaluator; if (data_filter != nullptr) { - ICEBERG_ASSIGN_OR_RAISE( - evaluator, InclusiveMetricsEvaluator::Make(data_filter, *schema, - /*case_sensitive=*/true)); + ICEBERG_ASSIGN_OR_RAISE(evaluator, + InclusiveMetricsEvaluator::Make(data_filter, *schema, + /*case_sensitive=*/true)); } for (const auto& data_file : replaced_files) { @@ -871,19 +870,18 @@ Status MergingSnapshotUpdate::ValidateNoNewDeletesForDataFiles( Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( const TableMetadata& metadata, int64_t starting_snapshot_id, - std::shared_ptr data_filter, - const std::shared_ptr& parent, std::shared_ptr io) { - ICEBERG_ASSIGN_OR_RAISE( - auto deletes, - AddedDeleteFiles(metadata, starting_snapshot_id, nullptr, nullptr, parent, io)); + std::shared_ptr data_filter, const std::shared_ptr& parent, + std::shared_ptr io) { + ICEBERG_ASSIGN_OR_RAISE(auto deletes, AddedDeleteFiles(metadata, starting_snapshot_id, + nullptr, nullptr, parent, io)); auto referenced_delete_files = deletes->ReferencedDeleteFiles(); ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); std::unique_ptr evaluator; if (data_filter != nullptr) { - ICEBERG_ASSIGN_OR_RAISE( - evaluator, InclusiveMetricsEvaluator::Make(data_filter, *schema, - /*case_sensitive=*/true)); + ICEBERG_ASSIGN_OR_RAISE(evaluator, + InclusiveMetricsEvaluator::Make(data_filter, *schema, + /*case_sensitive=*/true)); } for (const auto& delete_file : referenced_delete_files) { @@ -903,9 +901,8 @@ Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( const TableMetadata& metadata, int64_t starting_snapshot_id, const PartitionSet& partition_set, const std::shared_ptr& parent, std::shared_ptr io) { - ICEBERG_ASSIGN_OR_RAISE( - auto deletes, - AddedDeleteFiles(metadata, starting_snapshot_id, nullptr, nullptr, parent, io)); + ICEBERG_ASSIGN_OR_RAISE(auto deletes, AddedDeleteFiles(metadata, starting_snapshot_id, + nullptr, nullptr, parent, io)); auto referenced_delete_files = deletes->ReferencedDeleteFiles(); for (const auto& delete_file : referenced_delete_files) { if (!delete_file->partition_spec_id.has_value() || @@ -913,30 +910,30 @@ Status MergingSnapshotUpdate::ValidateNoNewDeleteFiles( delete_file->partition)) { continue; } - return InvalidArgument("Found new conflicting delete files in validated partitions: {}", - delete_file->file_path); + return InvalidArgument( + "Found new conflicting delete files in validated partitions: {}", + delete_file->file_path); } return {}; } Status MergingSnapshotUpdate::ValidateDeletedDataFiles( const TableMetadata& metadata, int64_t starting_snapshot_id, - std::shared_ptr data_filter, - const std::shared_ptr& parent, std::shared_ptr io) { + std::shared_ptr data_filter, const std::shared_ptr& parent, + std::shared_ptr io) { if (parent == nullptr) { return {}; } ICEBERG_ASSIGN_OR_RAISE( - auto history, - ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, - {DataOperation::kOverwrite, DataOperation::kReplace, - DataOperation::kDelete}, - ManifestContent::kData, io)); - ICEBERG_ASSIGN_OR_RAISE(auto conflict_path, - FindMatchingDataFile(metadata, history.manifests, - ManifestStatus::kDeleted, data_filter, - nullptr, io, /*case_sensitive=*/true)); + auto history, ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kOverwrite, DataOperation::kReplace, + DataOperation::kDelete}, + ManifestContent::kData, io)); + ICEBERG_ASSIGN_OR_RAISE( + auto conflict_path, + FindMatchingDataFile(metadata, history.manifests, ManifestStatus::kDeleted, + data_filter, nullptr, io, /*case_sensitive=*/true)); if (conflict_path.has_value()) { return InvalidArgument( "Found conflicting deleted files that can contain rows matching {}: {}", @@ -955,19 +952,17 @@ Status MergingSnapshotUpdate::ValidateDeletedDataFiles( } ICEBERG_ASSIGN_OR_RAISE( - auto history, - ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, - {DataOperation::kOverwrite, DataOperation::kReplace, - DataOperation::kDelete}, - ManifestContent::kData, io)); + auto history, ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kOverwrite, DataOperation::kReplace, + DataOperation::kDelete}, + ManifestContent::kData, io)); ICEBERG_ASSIGN_OR_RAISE( auto conflict_path, FindMatchingDataFile(metadata, history.manifests, ManifestStatus::kDeleted, nullptr, &partition_set, io, /*case_sensitive=*/true)); if (conflict_path.has_value()) { - return InvalidArgument( - "Found conflicting deleted files in validated partitions: {}", - conflict_path.value()); + return InvalidArgument("Found conflicting deleted files in validated partitions: {}", + conflict_path.value()); } return {}; } @@ -990,10 +985,9 @@ Result> MergingSnapshotUpdate::AddedDeleteFiles } ICEBERG_ASSIGN_OR_RAISE( - auto history, - ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, - {DataOperation::kOverwrite, DataOperation::kDelete}, - ManifestContent::kDeletes, io)); + auto history, ValidationHistory(metadata, parent->snapshot_id, starting_snapshot_id, + {DataOperation::kOverwrite, DataOperation::kDelete}, + ManifestContent::kDeletes, io)); // Compute the starting sequence number from the starting snapshot. int64_t starting_seq = TableMetadata::kInitialSequenceNumber; From b6388505440038b2ff4a71229c86b925c85dea3d Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Wed, 27 May 2026 15:25:56 +0800 Subject: [PATCH 4/4] Use ranges algorithms in parity updates Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/iceberg/test/merging_snapshot_update_test.cc | 2 +- src/iceberg/update/merging_snapshot_update.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/test/merging_snapshot_update_test.cc b/src/iceberg/test/merging_snapshot_update_test.cc index a8317ddc3..ec3f09967 100644 --- a/src/iceberg/test/merging_snapshot_update_test.cc +++ b/src/iceberg/test/merging_snapshot_update_test.cc @@ -452,7 +452,7 @@ TEST_F(MergingSnapshotUpdateTest, AddDeleteFileWithExplicitSequenceWritesSequenc EXPECT_THAT(op->AddDelete(del_file, 17), IsOk()); ICEBERG_UNWRAP_OR_FAIL(auto manifests, op->Apply(*table_->metadata(), nullptr)); auto delete_manifest_it = - std::find_if(manifests.begin(), manifests.end(), [](const ManifestFile& manifest) { + std::ranges::find_if(manifests, [](const ManifestFile& manifest) { return manifest.content == ManifestContent::kDeletes; }); ASSERT_NE(delete_manifest_it, manifests.end()); diff --git a/src/iceberg/update/merging_snapshot_update.cc b/src/iceberg/update/merging_snapshot_update.cc index 93b169682..a25154e58 100644 --- a/src/iceberg/update/merging_snapshot_update.cc +++ b/src/iceberg/update/merging_snapshot_update.cc @@ -50,7 +50,7 @@ namespace { bool MatchesOperation(std::optional operation, std::initializer_list expected) { return operation.has_value() && - std::find(expected.begin(), expected.end(), operation.value()) != expected.end(); + std::ranges::find(expected, operation.value()) != expected.end(); } struct ValidationHistoryResult {