diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 46d7fe6b8b..93b58940cd 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1839,16 +1839,16 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: if field.field_type.is_primitive: if (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type: if field.field_type == TimestampType(): - # Downcasting of nanoseconds to microseconds if ( pa.types.is_timestamp(target_type) and not target_type.tz and pa.types.is_timestamp(values.type) - and not values.type.tz + and (values.type.tz in UTC_ALIASES or values.type.tz is None) ): + # Downcasting of nanoseconds to microseconds if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us: return values.cast(target_type, safe=False) - elif target_type.unit == "us" and values.type.unit in {"s", "ms"}: + elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}: return values.cast(target_type) raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}") elif field.field_type == TimestamptzType(): @@ -1858,6 +1858,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: and pa.types.is_timestamp(values.type) and (values.type.tz in UTC_ALIASES or values.type.tz is None) ): + # Downcasting of nanoseconds to microseconds if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us: return values.cast(target_type, safe=False) elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}: diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 3765ea6de6..383c4011fd 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -2684,6 +2684,43 @@ def test__to_requested_schema_timestamp_to_timestamptz_projection() -> None: assert expected.equals(actual_result) +def test__to_requested_schema_timestamptz_to_timestamp_projection() -> None: + # file is written with timestamp with timezone + file_schema = Schema(NestedField(1, "ts_field", TimestamptzType(), required=False)) + batch = pa.record_batch( + [ + pa.array( + [ + datetime(2025, 8, 14, 12, 0, 0, tzinfo=timezone.utc), + datetime(2025, 8, 14, 13, 0, 0, tzinfo=timezone.utc), + ], + type=pa.timestamp("us", tz="UTC"), + ) + ], + names=["ts_field"], + ) + + # table schema expects timestamp without timezone + table_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False)) + + actual_result = _to_requested_schema(table_schema, file_schema, batch, downcast_ns_timestamp_to_us=True) + expected = pa.record_batch( + [ + pa.array( + [ + datetime(2025, 8, 14, 12, 0, 0), + datetime(2025, 8, 14, 13, 0, 0), + ], + type=pa.timestamp("us"), + ) + ], + names=["ts_field"], + ) + + # expect actual_result to have no timezone + assert expected.equals(actual_result) + + def test__to_requested_schema_timestamps( arrow_table_schema_with_all_timestamp_precisions: pa.Schema, arrow_table_with_all_timestamp_precisions: pa.Table,