# Patch summary (free-text preamble placed before the first `diff --git` header):
#
# pyiceberg/io/pyarrow.py
#   * Adds the module constant ORC_FIELD_REQUIRED_KEY = b"iceberg.required" next to
#     the existing ORC_FIELD_ID_KEY = b"iceberg.id".
#   * In `field(self, field, field_result)`, when `self._file_format == FileFormat.ORC`,
#     stores `str(field.required).lower()` under ORC_FIELD_REQUIRED_KEY in the
#     per-field `metadata` dict before `pa.field(...)` is built. The new test below
#     expects the resulting values to be b"true" / b"false".
#
# tests/io/test_pyarrow.py
#   * test_orc_schema_conversion_with_field_ids: the two `assert not <field>.metadata`
#     checks are replaced by explicit "key not in metadata" checks for
#     ORC_FIELD_ID_KEY and PYARROW_PARQUET_FIELD_ID_KEY.
#   * New test_orc_schema_conversion_with_required_attribute: for FileFormat.PARQUET
#     the ORC_FIELD_REQUIRED_KEY must be absent from both fields' metadata; for
#     FileFormat.ORC the required field maps to b"true" and the optional one to b"false".
#
# NOTE(review): the line breaks and leading indentation of this patch have been
# collapsed (all hunks sit on two physical lines), so it will not apply as-is.
# Restore the original formatting before using it.
# NOTE(review): because indentation is lost, the nesting of the added
# `if self._file_format == FileFormat.ORC:` relative to the preceding `else:` branch
# cannot be read from this text. The updated no-field-ids assertions suggest it sits
# outside the field-id branch - confirm against the real file.
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 46d7fe6b8b..45d24b55d6 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -201,6 +201,7 @@ PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id" # ORC field ID key for Iceberg field IDs in ORC metadata ORC_FIELD_ID_KEY = b"iceberg.id" +ORC_FIELD_REQUIRED_KEY = b"iceberg.required" PYARROW_FIELD_DOC_KEY = b"doc" LIST_ELEMENT_NAME = "element" MAP_KEY_NAME = "key" @@ -722,6 +723,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field: else: # Default to Parquet for backward compatibility metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id) + if self._file_format == FileFormat.ORC: + metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower() return pa.field( name=field.name, diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 3765ea6de6..4b4c19357c 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None: id_field_no_ids = arrow_schema_no_ids.field(0) name_field_no_ids = arrow_schema_no_ids.field(1) - assert not id_field_no_ids.metadata - assert not name_field_no_ids.metadata + assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata + assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata + assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata + assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata + + +def test_orc_schema_conversion_with_required_attribute() -> None: + """ + Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute. 
+ To run just this test: + pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute + """ + from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow + from pyiceberg.manifest import FileFormat + from pyiceberg.schema import Schema + from pyiceberg.types import IntegerType, StringType + + # Define schema + schema = Schema( + NestedField(1, "id", IntegerType(), required=True), + NestedField(2, "name", StringType(), required=False), + ) + + # Test 1: Specify Parquet format + arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET) + + id_field = arrow_schema_default.field(0) + name_field = arrow_schema_default.field(1) + + assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata + assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata + + # Test 2: Specify ORC format + arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC) + + id_field_orc = arrow_schema_orc.field(0) + name_field_orc = arrow_schema_orc.field(1) + + assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"true" + assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"false" def test_orc_batching_behavior_documentation(tmp_path: Path) -> None: