From b9042779b868d676181688cf2d87fccbfc821d6a Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 1 Jun 2026 17:22:26 +0800 Subject: [PATCH 1/6] fix: add move constructor for RowRanges --- src/paimon/format/parquet/row_ranges.cpp | 4 ++-- src/paimon/format/parquet/row_ranges.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/paimon/format/parquet/row_ranges.cpp b/src/paimon/format/parquet/row_ranges.cpp index 1b03715be..40b2f46b3 100644 --- a/src/paimon/format/parquet/row_ranges.cpp +++ b/src/paimon/format/parquet/row_ranges.cpp @@ -44,11 +44,11 @@ RowRanges RowRanges::Union(const RowRanges& left, const RowRanges& right) { combined.reserve(left.ranges_.size() + right.ranges_.size()); combined.insert(combined.end(), left.ranges_.begin(), left.ranges_.end()); combined.insert(combined.end(), right.ranges_.begin(), right.ranges_.end()); - return RowRanges(Range::SortAndMergeOverlap(combined, /*adjacent=*/true)); + return RowRanges(std::move(Range::SortAndMergeOverlap(combined, /*adjacent=*/true))); } RowRanges RowRanges::Intersection(const RowRanges& left, const RowRanges& right) { - return RowRanges(Range::And(left.ranges_, right.ranges_)); + return RowRanges(std::move(Range::And(left.ranges_, right.ranges_))); } int64_t RowRanges::RowCount() const { diff --git a/src/paimon/format/parquet/row_ranges.h b/src/paimon/format/parquet/row_ranges.h index 288fa48f4..a9027050f 100644 --- a/src/paimon/format/parquet/row_ranges.h +++ b/src/paimon/format/parquet/row_ranges.h @@ -43,6 +43,9 @@ class RowRanges { /// Creates a RowRanges from a list of ranges. explicit RowRanges(const std::vector& ranges) : ranges_(ranges) {} + /// Creates a RowRanges from a list of ranges, taking ownership of the vector. + explicit RowRanges(const std::vector&& ranges) : ranges_(std::move(ranges)) {} + /// Creates a RowRanges with a single range [0, row_count - 1]. static RowRanges CreateSingle(int64_t row_count) { if (row_count <= 0) { From 84d73244fd1459b676f61402f3b8da7f06d9358c Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 2 Jun 2026 11:56:42 +0800 Subject: [PATCH 2/6] fix: add WhenBuffered after PreBuffer is called --- src/paimon/format/parquet/page_filtered_row_group_reader.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp index 4594717f0..fa311eb31 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp @@ -249,6 +249,8 @@ Result> PageFilteredRowGroupReader::Re // Pre-buffering failed, fall back to row-group level PreBuffer ::arrow::io::IOContext io_ctx(pool); parquet_reader->PreBuffer(rg_vec, col_vec, io_ctx, cache_options); + PAIMON_RETURN_NOT_OK_FROM_ARROW( + parquet_reader->WhenBuffered(rg_vec, col_vec).status()); } } else { PAIMON_RETURN_NOT_OK_FROM_ARROW(parquet_reader->WhenBuffered(rg_vec, col_vec).status()); From ffebaedd51e0bd4939b8a0341c270c6b278170c7 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 2 Jun 2026 14:00:07 +0800 Subject: [PATCH 3/6] fix(ColumnIndexFilter): return all row when literal is empty --- .../format/parquet/column_index_filter.cpp | 31 ++++++------------- .../parquet/column_index_filter_test.cpp | 28 +++++++++++++++++ 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/paimon/format/parquet/column_index_filter.cpp b/src/paimon/format/parquet/column_index_filter.cpp index cf638cf6d..c2f9ab618 100644 --- a/src/paimon/format/parquet/column_index_filter.cpp +++ b/src/paimon/format/parquet/column_index_filter.cpp @@ -96,6 +96,10 @@ Result ColumnIndexFilter::VisitLeafPredicate( const auto& literals = leaf_predicate->Literals(); FieldType field_type = leaf_predicate->GetFieldType(); + if (function_type != Function::Type::IS_NULL && function_type != Function::Type::IS_NOT_NULL && + literals.empty()) { + return RowRanges::CreateSingle(row_group_row_count); + } std::vector matching_pages; switch (function_type) { @@ -106,37 +110,22 @@ Result ColumnIndexFilter::VisitLeafPredicate( matching_pages = FilterPagesByIsNotNull(column_index_ptr); break; case Function::Type::EQUAL: - if (!literals.empty()) { - matching_pages = FilterPagesByEqual(column_index_ptr, literals[0], field_type); - } + matching_pages = FilterPagesByEqual(column_index_ptr, literals[0], field_type); break; case Function::Type::NOT_EQUAL: - if (!literals.empty()) { - matching_pages = FilterPagesByNotEqual(column_index_ptr, literals[0], field_type); - } + matching_pages = FilterPagesByNotEqual(column_index_ptr, literals[0], field_type); break; case Function::Type::LESS_THAN: - if (!literals.empty()) { - matching_pages = FilterPagesByLessThan(column_index_ptr, literals[0], field_type); - } + matching_pages = FilterPagesByLessThan(column_index_ptr, literals[0], field_type); break; case Function::Type::LESS_OR_EQUAL: - if (!literals.empty()) { - matching_pages = - FilterPagesByLessOrEqual(column_index_ptr, literals[0], field_type); - } + matching_pages = FilterPagesByLessOrEqual(column_index_ptr, literals[0], field_type); break; case Function::Type::GREATER_THAN: - if (!literals.empty()) { - matching_pages = - FilterPagesByGreaterThan(column_index_ptr, literals[0], field_type); - } + matching_pages = FilterPagesByGreaterThan(column_index_ptr, literals[0], field_type); break; case Function::Type::GREATER_OR_EQUAL: - if (!literals.empty()) { - matching_pages = - FilterPagesByGreaterOrEqual(column_index_ptr, literals[0], field_type); - } + matching_pages = FilterPagesByGreaterOrEqual(column_index_ptr, literals[0], field_type); break; case Function::Type::IN: matching_pages = FilterPagesByIn(column_index_ptr, literals, field_type); diff --git a/src/paimon/format/parquet/column_index_filter_test.cpp b/src/paimon/format/parquet/column_index_filter_test.cpp index 8249f6356..321d6b4e8 100644 --- a/src/paimon/format/parquet/column_index_filter_test.cpp +++ b/src/paimon/format/parquet/column_index_filter_test.cpp @@ -26,6 +26,9 @@ #include "arrow/c/abi.h" #include "arrow/c/bridge.h" #include "gtest/gtest.h" +#include "paimon/common/predicate/equal.h" +#include "paimon/common/predicate/in.h" +#include "paimon/common/predicate/leaf_predicate_impl.h" #include "paimon/common/utils/arrow/arrow_input_stream_adapter.h" #include "paimon/common/utils/arrow/mem_utils.h" #include "paimon/defs.h" @@ -480,4 +483,29 @@ TEST_F(ColumnIndexFilterTest, NullPredicateReturnsAllRows) { EXPECT_EQ(row_group_row_count_, ranges.RowCount()); } +/// When literals is empty for comparison predicates (EQUAL, NOT_EQUAL, LESS_THAN, +/// LESS_OR_EQUAL, GREATER_THAN, GREATER_OR_EQUAL), the filter should return all +/// rows (conservative fallback) rather than returning empty ranges. +TEST_F(ColumnIndexFilterTest, EmptyLiteralsReturnsAllRows) { + // Construct a LeafPredicate with EQUAL function but empty literals vector. + // This simulates the edge case where literals are unexpectedly empty. + auto pred = std::make_shared(paimon::Equal::Instance(), 0, "val", + FieldType::INT, std::vector()); + ASSERT_OK_AND_ASSIGN(auto ranges, Filter(pred)); + // With empty literals, the filter cannot evaluate the comparison, + // so it should conservatively return all rows. + EXPECT_EQ(row_group_row_count_, ranges.RowCount()); +} + +/// Empty literals for IN predicate — the early guard in VisitLeafPredicate treats +/// all non-IS_NULL/IS_NOT_NULL predicates with empty literals conservatively, +/// returning all rows rather than risking incorrect filtering. +TEST_F(ColumnIndexFilterTest, EmptyLiteralsInReturnsAllRows) { + auto pred = std::make_shared(paimon::In::Instance(), 0, "val", + FieldType::INT, std::vector()); + ASSERT_OK_AND_ASSIGN(auto ranges, Filter(pred)); + // Empty literals → conservative fallback → all rows. + EXPECT_EQ(row_group_row_count_, ranges.RowCount()); +} + } // namespace paimon::parquet::test From 7b7fd11c89a8b21e3346756a5a57fd71d6838223 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 2 Jun 2026 14:04:17 +0800 Subject: [PATCH 4/6] style: delete duplicated macro and move it to common header files --- src/paimon/format/parquet/file_reader_wrapper.cpp | 12 +----------- .../format/parquet/parquet_file_batch_reader.cpp | 11 ----------- src/paimon/format/parquet/parquet_format_defs.h | 11 +++++++++++ 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/paimon/format/parquet/file_reader_wrapper.cpp b/src/paimon/format/parquet/file_reader_wrapper.cpp index bfabb9f86..98c5a9f38 100644 --- a/src/paimon/format/parquet/file_reader_wrapper.cpp +++ b/src/paimon/format/parquet/file_reader_wrapper.cpp @@ -26,23 +26,13 @@ #include "fmt/format.h" #include "paimon/format/parquet/column_index_filter.h" #include "paimon/format/parquet/page_filtered_row_group_reader.h" +#include "paimon/format/parquet/parquet_format_defs.h" #include "paimon/macros.h" #include "parquet/arrow/reader.h" #include "parquet/file_reader.h" #include "parquet/metadata.h" #include "parquet/page_index.h" -// Convert any std::exception thrown by underlying Parquet/Arrow APIs into a -// Status. Used as the trailing catch clauses of a try block in every public -// method that calls into the parquet C++ API, so the read layer never throws. -#define PAIMON_PARQUET_CATCH_AND_RETURN_STATUS(context) \ - catch (const std::exception& e) { \ - return Status::Invalid(fmt::format("{}: {}", (context), e.what())); \ - } \ - catch (...) { \ - return Status::UnknownError((context), ": unknown error"); \ - } - namespace paimon::parquet { namespace { diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index c37570acb..3c78720f1 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -48,17 +48,6 @@ #include "parquet/arrow/reader.h" #include "parquet/properties.h" -// Convert any std::exception thrown by underlying Parquet/Arrow APIs into a -// Status. Used as the trailing catch clauses of a try block in every public -// method that calls into the parquet C++ API, so the read layer never throws. -#define PAIMON_PARQUET_CATCH_AND_RETURN_STATUS(context) \ - catch (const std::exception& e) { \ - return Status::Invalid(fmt::format("{}: {}", (context), e.what())); \ - } \ - catch (...) { \ - return Status::UnknownError((context), ": unknown error"); \ - } - namespace arrow { class MemoryPool; } // namespace arrow diff --git a/src/paimon/format/parquet/parquet_format_defs.h b/src/paimon/format/parquet/parquet_format_defs.h index b979b5780..b646ca6c2 100644 --- a/src/paimon/format/parquet/parquet_format_defs.h +++ b/src/paimon/format/parquet/parquet_format_defs.h @@ -19,6 +19,17 @@ #include #include +// Convert any std::exception thrown by underlying Parquet/Arrow APIs into a +// Status. Used as the trailing catch clauses of a try block in every public +// method that calls into the parquet C++ API, so the read layer never throws. +#define PAIMON_PARQUET_CATCH_AND_RETURN_STATUS(context) \ + catch (const std::exception& e) { \ + return Status::Invalid(fmt::format("{}: {}", (context), e.what())); \ + } \ + catch (...) { \ + return Status::UnknownError((context), ": unknown error"); \ + } + namespace paimon::parquet { // write From e20b0308dd821106b1c933d982fd7c15936fa20c Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 2 Jun 2026 14:09:20 +0800 Subject: [PATCH 5/6] fix: compute 'column_name_to_index' only when needed --- .../parquet/parquet_file_batch_reader.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 3c78720f1..8da49efd6 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -148,18 +148,6 @@ Status ParquetFileBatchReader::SetReadSchema( } } - // Build column name to index map for page-level filtering. - // For leaf columns, indices[0] is the correct leaf column index in Parquet. - // For nested types (struct/list/map), FlattenSchema produces multiple leaf indices, - // but predicate pushdown only targets leaf columns with simple types, so indices[0] - // is always the correct single leaf index for predicate evaluation. - std::map column_name_to_index; - for (const auto& [name, indices] : field_index_map) { - if (!indices.empty()) { - column_name_to_index[name] = indices[0]; - } - } - std::vector row_groups = arrow::internal::Iota(reader_->GetNumberOfRowGroups()); if (predicate) { PAIMON_ASSIGN_OR_RAISE(row_groups, @@ -177,6 +165,18 @@ Status ParquetFileBatchReader::SetReadSchema( OptionsUtils::GetValueFromMap(options_, PARQUET_READ_ENABLE_PAGE_INDEX_FILTER, DEFAULT_PARQUET_READ_ENABLE_PAGE_INDEX_FILTER)); if (enable_page_index_filter) { + // Build column name to index map for page-level filtering. + // For leaf columns, indices[0] is the correct leaf column index in Parquet. + // For nested types (struct/list/map), FlattenSchema produces multiple leaf indices, + // but predicate pushdown only targets leaf columns with simple types, so indices[0] + // is always the correct single leaf index for predicate evaluation. + std::map column_name_to_index; + for (const auto& [name, indices] : field_index_map) { + if (!indices.empty()) { + column_name_to_index[name] = indices[0]; + } + } + PAIMON_ASSIGN_OR_RAISE( auto page_filter_result, FilterRowGroupsByPageIndex(predicate, column_name_to_index, row_groups)); From 6bb45ba2f244a565cb3bf76e32fd9f37923a50a5 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 2 Jun 2026 14:58:06 +0800 Subject: [PATCH 6/6] fix: removing 'const' in the move constructor of RowRanges --- src/paimon/format/parquet/row_ranges.cpp | 4 ++-- src/paimon/format/parquet/row_ranges.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/paimon/format/parquet/row_ranges.cpp b/src/paimon/format/parquet/row_ranges.cpp index 40b2f46b3..1b03715be 100644 --- a/src/paimon/format/parquet/row_ranges.cpp +++ b/src/paimon/format/parquet/row_ranges.cpp @@ -44,11 +44,11 @@ RowRanges RowRanges::Union(const RowRanges& left, const RowRanges& right) { combined.reserve(left.ranges_.size() + right.ranges_.size()); combined.insert(combined.end(), left.ranges_.begin(), left.ranges_.end()); combined.insert(combined.end(), right.ranges_.begin(), right.ranges_.end()); - return RowRanges(std::move(Range::SortAndMergeOverlap(combined, /*adjacent=*/true))); + return RowRanges(Range::SortAndMergeOverlap(combined, /*adjacent=*/true)); } RowRanges RowRanges::Intersection(const RowRanges& left, const RowRanges& right) { - return RowRanges(std::move(Range::And(left.ranges_, right.ranges_))); + return RowRanges(Range::And(left.ranges_, right.ranges_)); } int64_t RowRanges::RowCount() const { diff --git a/src/paimon/format/parquet/row_ranges.h b/src/paimon/format/parquet/row_ranges.h index a9027050f..401e49574 100644 --- a/src/paimon/format/parquet/row_ranges.h +++ b/src/paimon/format/parquet/row_ranges.h @@ -44,7 +44,7 @@ class RowRanges { explicit RowRanges(const std::vector& ranges) : ranges_(ranges) {} /// Creates a RowRanges from a list of ranges, taking ownership of the vector. - explicit RowRanges(const std::vector&& ranges) : ranges_(std::move(ranges)) {} + explicit RowRanges(std::vector&& ranges) : ranges_(std::move(ranges)) {} /// Creates a RowRanges with a single range [0, row_count - 1]. static RowRanges CreateSingle(int64_t row_count) {