Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions src/iceberg/parquet/parquet_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ Result<SchemaProjection> BuildProjection(::parquet::arrow::FileReader* reader,
const Schema& read_schema) {
auto metadata = reader->parquet_reader()->metadata();

ICEBERG_ASSIGN_OR_RAISE(auto has_field_ids,
HasFieldIds(metadata->schema()->schema_root()));
if (!has_field_ids) {
if (!HasFieldIds(metadata->schema()->schema_root())) {
// TODO(gangwu): apply name mapping to Parquet schema
return NotImplemented("Applying name mapping to Parquet schema is not implemented");
}
Expand Down
20 changes: 18 additions & 2 deletions src/iceberg/parquet/parquet_schema_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@
* under the License.
*/

#include <parquet/schema.h>

#include "iceberg/parquet/parquet_schema_util_internal.h"
#include "iceberg/util/checked_cast.h"

namespace iceberg::parquet {

Expand All @@ -30,8 +33,21 @@ Result<std::vector<int>> SelectedColumnIndices(const SchemaProjection& projectio
return NotImplemented("NYI");
}

Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node) {
return NotImplemented("NYI");
bool HasFieldIds(const ::parquet::schema::NodePtr& node) {
if (node->field_id() >= 0) {
return true;
}

if (node->is_group()) {
auto group_node = internal::checked_pointer_cast<::parquet::schema::GroupNode>(node);
for (int i = 0; i < group_node->field_count(); i++) {
if (HasFieldIds(group_node->field(i))) {
return true;
}
}
Comment on lines +42 to +47
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this means? Any leaf has field-id?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just simply check if any field (whether leaf or non-leaf) has field-id. This is in sync with the Java impl.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we cannot blindly check if all fields have ids because it is valid to have some fields that do not have ids, especially when variant type is supported.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I knew that for 3-levels complex type, fieldId might not exists, but here I don't fully understand the purpose, so any leaf has field-id means writer is field-id aware?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Iceberg only supports three-level list encoding per the spec: https://iceberg.apache.org/spec/#parquet

Lists must use the 3-level representation.

}

return false;
}

} // namespace iceberg::parquet
5 changes: 2 additions & 3 deletions src/iceberg/parquet/parquet_schema_util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ Result<std::vector<int>> SelectedColumnIndices(const SchemaProjection& projectio
/// \brief Check whether the Parquet schema has field IDs.
///
/// \param root_node The root node of the Parquet schema.
/// \return True if the Parquet schema has field IDs, false otherwise. Return error if
/// the Parquet schema has partial field IDs.
Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node);
/// \return True if the Parquet schema has field IDs, false otherwise.
bool HasFieldIds(const ::parquet::schema::NodePtr& root_node);

} // namespace iceberg::parquet
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,6 @@ if(ICEBERG_BUILD_BUNDLE)
SOURCES
test_common.cc
in_memory_catalog_test.cc)

add_iceberg_test(parquet_test USE_BUNDLE SOURCES parquet_schema_test.cc)
endif()
66 changes: 66 additions & 0 deletions test/parquet_schema_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <gtest/gtest.h>
#include <parquet/schema.h>
#include <parquet/types.h>

#include "iceberg/parquet/parquet_schema_util_internal.h"

namespace iceberg::parquet {

namespace {

::parquet::schema::NodePtr MakeInt32Node(const std::string& name, int field_id = -1) {
return ::parquet::schema::PrimitiveNode::Make(
name, ::parquet::Repetition::REQUIRED, ::parquet::LogicalType::None(),
::parquet::Type::INT32, /*primitive_length=*/-1, field_id);
}

::parquet::schema::NodePtr MakeGroupNode(const std::string& name,
const ::parquet::schema::NodeVector& fields,
int field_id = -1) {
return ::parquet::schema::GroupNode::Make(name, ::parquet::Repetition::REQUIRED, fields,
/*logical_type=*/nullptr, field_id);
}

} // namespace

TEST(HasFieldIds, PrimitiveNode) {
EXPECT_FALSE(HasFieldIds(MakeInt32Node("test_field")));
EXPECT_TRUE(HasFieldIds(MakeInt32Node("test_field", /*field_id=*/1)));
}

TEST(HasFieldIds, GroupNode) {
auto group_node_without_field_id =
MakeGroupNode("test_group", {MakeInt32Node("c1"), MakeInt32Node("c2")});
EXPECT_FALSE(HasFieldIds(group_node_without_field_id));

auto group_node_with_full_field_id = MakeGroupNode(
"test_group",
{MakeInt32Node("c1", /*field_id=*/2), MakeInt32Node("c2", /*field_id=*/3)},
/*field_id=*/1);
EXPECT_TRUE(HasFieldIds(group_node_with_full_field_id));

auto group_node_with_partial_field_id = MakeGroupNode(
"test_group", {MakeInt32Node("c1", /*field_id=*/1), MakeInt32Node("c2")});
EXPECT_TRUE(HasFieldIds(group_node_with_partial_field_id));
}

} // namespace iceberg::parquet
Loading