Skip to content

V3 manifest writer does not write content metadata field #1973

@yshcz

Description

@yshcz

Apache Iceberg Rust version

None

Describe the bug

The spec requires manifests written in format version 2 or later to include the `content` field in the Avro file's key-value metadata.

Currently the writer only writes the content metadata for V2 manifests. V3 manifests are missing this required field. This causes a roundtrip problem where V3 delete manifests written by iceberg-rust are read back as data manifests.

The fix is probably just to update the condition that writes the `content` metadata field so that it also covers V3.

To Reproduce

Add the following test to `crates/iceberg/src/spec/manifest/mod.rs`:

/// Reproduces the V3 delete-manifest roundtrip problem.
///
/// A single position-delete entry is written through the V3 deletes writer,
/// the resulting Avro file is parsed back, and the parsed manifest's content
/// type is checked. The final assertion pins the *current* (buggy) result,
/// `ManifestContentType::Data`; once the writer emits the `content` metadata
/// for V3, it should be flipped to `ManifestContentType::Deletes`.
#[tokio::test]
async fn test_v3_delete_manifest_delte_file_roundtrip() {
    // Two optional columns: `id: long` and `data: string`.
    let fields = vec![
        Arc::new(NestedField::optional(
            1,
            "id",
            Type::Primitive(PrimitiveType::Long),
        )),
        Arc::new(NestedField::optional(
            2,
            "data",
            Type::Primitive(PrimitiveType::String),
        )),
    ];
    let schema = Arc::new(Schema::builder().with_fields(fields).build().unwrap());

    // Partition spec with id 0 and no partition fields added.
    let spec_builder = PartitionSpec::builder(schema.clone()).with_spec_id(0);
    let partition_spec = spec_builder.build().unwrap();

    // Destination for the manifest: a file inside a fresh temporary directory.
    // `scratch` stays bound until the end of the test so the directory outlives the read-back.
    let scratch = TempDir::new().unwrap();
    let manifest_path = scratch.path().join("v3_delete_manifest.avro");
    let file_io = FileIOBuilder::new_fs_io().build().unwrap();
    let sink = file_io.new_output(manifest_path.to_str().unwrap()).unwrap();

    // V3 writer configured for delete manifests.
    let builder = ManifestWriterBuilder::new(
        sink,
        Some(1),
        None,
        schema.clone(),
        partition_spec.clone(),
    );
    let mut writer = builder.build_v3_deletes();

    // The position-delete file described by the single manifest entry.
    let position_delete_file = DataFile {
        content: DataContentType::PositionDeletes,
        file_path: "s3://bucket/table/data/delete-00000.parquet".to_string(),
        file_format: DataFileFormat::Parquet,
        partition: Struct::empty(),
        record_count: 10,
        file_size_in_bytes: 1024,
        column_sizes: HashMap::new(),
        value_counts: HashMap::new(),
        null_value_counts: HashMap::new(),
        nan_value_counts: HashMap::new(),
        lower_bounds: HashMap::new(),
        upper_bounds: HashMap::new(),
        key_metadata: None,
        split_offsets: None,
        equality_ids: None,
        sort_order_id: None,
        partition_spec_id: 0,
        first_row_id: None,
        referenced_data_file: None,
        content_offset: None,
        content_size_in_bytes: None,
    };
    let entry = ManifestEntry {
        status: ManifestStatus::Added,
        snapshot_id: None,
        sequence_number: None,
        file_sequence_number: None,
        data_file: position_delete_file,
    };

    writer.add_entry(entry).unwrap();
    let written = writer.write_manifest_file().await.unwrap();

    // The in-memory summary returned by the writer already reports Deletes.
    assert_eq!(written.content, ManifestContentType::Deletes);

    // Parse the bytes that actually landed on disk.
    let raw = fs::read(&manifest_path).expect("read_file must succeed");
    let reread = Manifest::parse_avro(raw.as_slice()).unwrap();

    // Bug: the parsed manifest reports Data rather than Deletes.
    assert_eq!(reread.metadata().content, ManifestContentType::Data);

    // Expected once fixed:
    // assert_eq!(reread.metadata().content, ManifestContentType::Deletes);
}

Expected behavior

No response

Willingness to contribute

None

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions