3 changes: 3 additions & 0 deletions pyiceberg/io/pyarrow.py
@@ -201,6 +201,7 @@
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
# ORC field ID key for Iceberg field IDs in ORC metadata
ORC_FIELD_ID_KEY = b"iceberg.id"
ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
PYARROW_FIELD_DOC_KEY = b"doc"
LIST_ELEMENT_NAME = "element"
MAP_KEY_NAME = "key"
@@ -722,6 +723,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
else:
# Default to Parquet for backward compatibility
metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
if self._file_format == FileFormat.ORC:
metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()

return pa.field(
name=field.name,
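Note: as a quick illustration of what this change produces, here is a minimal sketch mirroring the new test below. It assumes schema_to_pyarrow accepts the file_format parameter that the added tests exercise; the exact metadata contents shown in the comments are the expected values, not verified output.

from pyiceberg.io.pyarrow import ORC_FIELD_ID_KEY, ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
from pyiceberg.manifest import FileFormat
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, NestedField, StringType

schema = Schema(
    NestedField(1, "id", IntegerType(), required=True),
    NestedField(2, "name", StringType(), required=False),
)

# Converting with the ORC file format now attaches the required flag
# (alongside the existing field ID key) as field-level metadata.
arrow_schema = schema_to_pyarrow(schema, file_format=FileFormat.ORC)
print(arrow_schema.field(0).metadata)  # expected to include b"iceberg.id": b"1", b"iceberg.required": b"true"
print(arrow_schema.field(1).metadata)  # expected to include b"iceberg.id": b"2", b"iceberg.required": b"false"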
42 changes: 40 additions & 2 deletions tests/io/test_pyarrow.py
@@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None:
id_field_no_ids = arrow_schema_no_ids.field(0)
name_field_no_ids = arrow_schema_no_ids.field(1)

assert not id_field_no_ids.metadata
assert not name_field_no_ids.metadata
assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata
assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata
assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata


def test_orc_schema_conversion_with_required_attribute() -> None:
"""
Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute.
To run just this test:
pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
"""
from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
from pyiceberg.manifest import FileFormat
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, StringType

# Define schema
schema = Schema(
NestedField(1, "id", IntegerType(), required=True),
NestedField(2, "name", StringType(), required=False),
)

# Test 1: Specify Parquet format
arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET)

id_field = arrow_schema_default.field(0)
name_field = arrow_schema_default.field(1)

assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata
assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata

# Test 2: Specify ORC format
arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC)

id_field_orc = arrow_schema_orc.field(0)
name_field_orc = arrow_schema_orc.field(1)

assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"true"
assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"false"


def test_orc_batching_behavior_documentation(tmp_path: Path) -> None: