-
Notifications
You must be signed in to change notification settings - Fork 389
Description
Apache Iceberg Rust version
None
Describe the bug
Currently, TableUpdate::AddSpec always assigns new partition field IDs instead of reusing existing ones for equivalent (source_id, transform) pairs.
Related spec:
Partition field IDs must be reused if an existing partition spec contains an equivalent field.
When evolving a spec, changes should not cause partition field IDs to change because the partition field IDs are used as the partition tuple field IDs in manifest files.
The spec defines "equivalent field" as matching source_id, transform, and name, but the Java impl only checks (source_id, transform) pair (see TableMetadata.reassignPartitionIds). Regardless of which definition iceberg-rust adopts, it should implement some form of field ID reuse, currently it does neither.
Similar issues may exist in other code paths other than REST updates. A dedicated abstraction like a partition field ID allocator might be worth considering.
To Reproduce
Add the following to crates/iceberg/src/catalog/mod.rs
#[test]
fn test_rest_add_spec_should_reuse_field_id_for_equivalent_fields() {
// Create initial metadata with partition spec: identity(id) -> field_id = 1000
let schema = Schema::builder()
.with_fields(vec![
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
NestedField::required(2, "data", Type::Primitive(PrimitiveType::String)).into(),
])
.build()
.unwrap();
let initial_partition_spec = UnboundPartitionSpec::builder()
.add_partition_field(1, "id", Transform::Identity)
.unwrap()
.build();
let initial_metadata = TableMetadataBuilder::new(
schema,
initial_partition_spec,
SortOrder::unsorted_order(),
"s3://bucket/table".to_string(),
FormatVersion::V2,
HashMap::new(),
)
.unwrap()
.build()
.unwrap()
.metadata;
// Verify initial spec has field_id = 1000
let first_spec = initial_metadata.default_partition_spec();
assert_eq!(first_spec.fields().len(), 1);
assert_eq!(first_spec.fields()[0].field_id, 1000);
assert_eq!(first_spec.fields()[0].source_id, 1);
assert_eq!(first_spec.fields()[0].transform, Transform::Identity);
// Simulate REST API request: TableUpdate::AddSpec
let rest_request = TableUpdate::AddSpec {
spec: UnboundPartitionSpec::builder()
.add_partition_field(1, "id", Transform::Identity) // Same as spec 0's field
.unwrap()
.add_partition_field(2, "data_bucket", Transform::Bucket(2)) // New field
.unwrap()
.build(),
};
let builder =
initial_metadata.into_builder(Some("s3://bucket/table/metadata/v1.json".to_string()));
let build_result = rest_request.apply(builder).unwrap().build().unwrap();
// Verify the new spec was created with spec_id = 1
let second_spec = build_result.metadata.partition_spec_by_id(1).unwrap();
assert_eq!(second_spec.fields().len(), 2);
// Verify field_id reuse for equivalent field
// Expected: 1000 (reused from first spec)
// Actual: 1001 (incorrectly assigned new ID)
assert_eq!(second_spec.fields()[0].field_id, 1000);
}Expected behavior
Given:
partition spec 0: (1, identity) -> field_id = 1000
Expected:
partition spec 0: (1, identity) -> field_id = 1000
partition spec 1: (1, identity) -> field_id = 1000 (reused)
(2, bucket) -> field_id = 1001 (new)
Actual:
partition spec 0: (1, identity) -> field_id = 1000
partition spec 1: (1, identity) -> field_id = 1001 (incorrectly assigned new ID)
(2, bucket) -> field_id = 1002 (new)
Willingness to contribute
None