From fea073ab9af5ec74edda6434b43d59bf60641111 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 9 Nov 2025 08:01:18 -0800 Subject: [PATCH 1/3] allow arrow timestamptz with UTC to be read as iceberg timestamp --- pyiceberg/io/pyarrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 46d7fe6b8b..b1be113c03 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1844,11 +1844,11 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: pa.types.is_timestamp(target_type) and not target_type.tz and pa.types.is_timestamp(values.type) - and not values.type.tz + and (values.type.tz in UTC_ALIASES or values.type.tz is None) ): if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us: return values.cast(target_type, safe=False) - elif target_type.unit == "us" and values.type.unit in {"s", "ms"}: + elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}: return values.cast(target_type) raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}") elif field.field_type == TimestamptzType(): From ac94b2967b92a6fffa7225719d53c1d7f669d538 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 9 Nov 2025 08:01:28 -0800 Subject: [PATCH 2/3] nit: comment --- pyiceberg/io/pyarrow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index b1be113c03..93b58940cd 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1839,13 +1839,13 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: if field.field_type.is_primitive: if (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type: if field.field_type == TimestampType(): - # Downcasting of nanoseconds to microseconds if ( pa.types.is_timestamp(target_type) and not target_type.tz and pa.types.is_timestamp(values.type) and (values.type.tz in UTC_ALIASES or values.type.tz is None) ): + # Downcasting of nanoseconds to microseconds if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us: return values.cast(target_type, safe=False) elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}: @@ -1858,6 +1858,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: and pa.types.is_timestamp(values.type) and (values.type.tz in UTC_ALIASES or values.type.tz is None) ): + # Downcasting of nanoseconds to microseconds if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us: return values.cast(target_type, safe=False) elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}: From f5ff73ae5409e1c3cd2707bfdd09e4c47c11f7dd Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 9 Nov 2025 08:39:39 -0800 Subject: [PATCH 3/3] add testcase --- tests/io/test_pyarrow.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 3765ea6de6..383c4011fd 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2684,6 +2684,43 @@ def test__to_requested_schema_timestamp_to_timestamptz_projection() -> None: assert expected.equals(actual_result) +def test__to_requested_schema_timestamptz_to_timestamp_projection() -> None: + # file is written with timestamp with timezone + file_schema = Schema(NestedField(1, "ts_field", TimestamptzType(), required=False)) + batch = pa.record_batch( + [ + pa.array( + [ + datetime(2025, 8, 14, 12, 0, 0, tzinfo=timezone.utc), + datetime(2025, 8, 14, 13, 0, 0, tzinfo=timezone.utc), + ], + type=pa.timestamp("us", tz="UTC"), + ) + ], + names=["ts_field"], + ) + + # table schema expects timestamp without timezone + table_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False)) + + actual_result = _to_requested_schema(table_schema, file_schema, batch, downcast_ns_timestamp_to_us=True) + expected = pa.record_batch( + [ + pa.array( + [ + datetime(2025, 8, 14, 12, 0, 0), + datetime(2025, 8, 14, 13, 0, 0), + ], + type=pa.timestamp("us"), + ) + ], + names=["ts_field"], + ) + + # expect actual_result to have no timezone + assert expected.equals(actual_result) + + def test__to_requested_schema_timestamps( arrow_table_schema_with_all_timestamp_precisions: pa.Schema, arrow_table_with_all_timestamp_precisions: pa.Table,