From 5a38bd5f5a7e76a72925e1bccd771ca9955218b5 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 3 Aug 2025 22:54:09 +0800 Subject: [PATCH] feat(parquet): add HasFieldIds check --- src/iceberg/parquet/parquet_reader.cc | 4 +- src/iceberg/parquet/parquet_schema_util.cc | 20 +++++- .../parquet/parquet_schema_util_internal.h | 5 +- test/CMakeLists.txt | 2 + test/parquet_schema_test.cc | 66 +++++++++++++++++++ 5 files changed, 89 insertions(+), 8 deletions(-) create mode 100644 test/parquet_schema_test.cc diff --git a/src/iceberg/parquet/parquet_reader.cc b/src/iceberg/parquet/parquet_reader.cc index 3ee260a44..282ce5ee8 100644 --- a/src/iceberg/parquet/parquet_reader.cc +++ b/src/iceberg/parquet/parquet_reader.cc @@ -61,9 +61,7 @@ Result BuildProjection(::parquet::arrow::FileReader* reader, const Schema& read_schema) { auto metadata = reader->parquet_reader()->metadata(); - ICEBERG_ASSIGN_OR_RAISE(auto has_field_ids, - HasFieldIds(metadata->schema()->schema_root())); - if (!has_field_ids) { + if (!HasFieldIds(metadata->schema()->schema_root())) { // TODO(gangwu): apply name mapping to Parquet schema return NotImplemented("Applying name mapping to Parquet schema is not implemented"); } diff --git a/src/iceberg/parquet/parquet_schema_util.cc b/src/iceberg/parquet/parquet_schema_util.cc index 8648fa931..05c712273 100644 --- a/src/iceberg/parquet/parquet_schema_util.cc +++ b/src/iceberg/parquet/parquet_schema_util.cc @@ -17,7 +17,10 @@ * under the License. */ +#include + #include "iceberg/parquet/parquet_schema_util_internal.h" +#include "iceberg/util/checked_cast.h" namespace iceberg::parquet { @@ -30,8 +33,21 @@ Result> SelectedColumnIndices(const SchemaProjection& projectio return NotImplemented("NYI"); } -Result HasFieldIds(const ::parquet::schema::NodePtr& root_node) { - return NotImplemented("NYI"); +bool HasFieldIds(const ::parquet::schema::NodePtr& node) { + if (node->field_id() >= 0) { + return true; + } + + if (node->is_group()) { + auto group_node = internal::checked_pointer_cast<::parquet::schema::GroupNode>(node); + for (int i = 0; i < group_node->field_count(); i++) { + if (HasFieldIds(group_node->field(i))) { + return true; + } + } + } + + return false; } } // namespace iceberg::parquet diff --git a/src/iceberg/parquet/parquet_schema_util_internal.h b/src/iceberg/parquet/parquet_schema_util_internal.h index f3b0f3706..84e71ab4f 100644 --- a/src/iceberg/parquet/parquet_schema_util_internal.h +++ b/src/iceberg/parquet/parquet_schema_util_internal.h @@ -47,8 +47,7 @@ Result> SelectedColumnIndices(const SchemaProjection& projectio /// \brief Check whether the Parquet schema has field IDs. /// /// \param root_node The root node of the Parquet schema. -/// \return True if the Parquet schema has field IDs, false otherwise. Return error if -/// the Parquet schema has partial field IDs. -Result HasFieldIds(const ::parquet::schema::NodePtr& root_node); +/// \return True if the Parquet schema has field IDs, false otherwise. +bool HasFieldIds(const ::parquet::schema::NodePtr& root_node); } // namespace iceberg::parquet diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 51fc776c2..091fa292d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -116,4 +116,6 @@ if(ICEBERG_BUILD_BUNDLE) SOURCES test_common.cc in_memory_catalog_test.cc) + + add_iceberg_test(parquet_test USE_BUNDLE SOURCES parquet_schema_test.cc) endif() diff --git a/test/parquet_schema_test.cc b/test/parquet_schema_test.cc new file mode 100644 index 000000000..1a280ff4d --- /dev/null +++ b/test/parquet_schema_test.cc @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "iceberg/parquet/parquet_schema_util_internal.h" + +namespace iceberg::parquet { + +namespace { + +::parquet::schema::NodePtr MakeInt32Node(const std::string& name, int field_id = -1) { + return ::parquet::schema::PrimitiveNode::Make( + name, ::parquet::Repetition::REQUIRED, ::parquet::LogicalType::None(), + ::parquet::Type::INT32, /*primitive_length=*/-1, field_id); +} + +::parquet::schema::NodePtr MakeGroupNode(const std::string& name, + const ::parquet::schema::NodeVector& fields, + int field_id = -1) { + return ::parquet::schema::GroupNode::Make(name, ::parquet::Repetition::REQUIRED, fields, + /*logical_type=*/nullptr, field_id); +} + +} // namespace + +TEST(HasFieldIds, PrimitiveNode) { + EXPECT_FALSE(HasFieldIds(MakeInt32Node("test_field"))); + EXPECT_TRUE(HasFieldIds(MakeInt32Node("test_field", /*field_id=*/1))); +} + +TEST(HasFieldIds, GroupNode) { + auto group_node_without_field_id = + MakeGroupNode("test_group", {MakeInt32Node("c1"), MakeInt32Node("c2")}); + EXPECT_FALSE(HasFieldIds(group_node_without_field_id)); + + auto group_node_with_full_field_id = MakeGroupNode( + "test_group", + {MakeInt32Node("c1", /*field_id=*/2), MakeInt32Node("c2", /*field_id=*/3)}, + /*field_id=*/1); + EXPECT_TRUE(HasFieldIds(group_node_with_full_field_id)); + + auto group_node_with_partial_field_id = MakeGroupNode( + "test_group", {MakeInt32Node("c1", /*field_id=*/1), MakeInt32Node("c2")}); + EXPECT_TRUE(HasFieldIds(group_node_with_partial_field_id)); +} + +} // namespace iceberg::parquet