From d6e484d4ce6ff2b797c6da59cfd180f4a9559918 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Thu, 2 Apr 2026 04:10:24 +0000
Subject: [PATCH] [PYTHON] Fix PyArrow type inference test for Pandas 3
 timestamp resolution

Pandas 3 changed the default datetime resolution from nanoseconds to
microseconds. Update `test_pandas_series_numpy_backed` to use the correct
resolution based on the Pandas version.

Co-authored-by: Isaac
---
 .../test_pyarrow_array_type_inference.py      | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py
index 7e60f552e9c7a..169ddd2c55f4d 100644
--- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py
+++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py
@@ -299,6 +299,8 @@ def test_pandas_series_numpy_backed(self):
 
         # pandas >= 3 infers large_string instead of string for object-dtype string Series
         string_type = pa.large_string() if LooseVersion(pd.__version__) >= "3.0.0" else pa.string()
+        # pandas >= 3 defaults to microsecond resolution instead of nanosecond
+        ts_unit = "us" if LooseVersion(pd.__version__) >= "3.0.0" else "ns"
 
         sg = ZoneInfo("Asia/Singapore")
         la = "America/Los_Angeles"
@@ -324,17 +326,17 @@
             (pd.Series([True, False, True]), pa.bool_()),
             # Temporal
             (pd.Series([date1, date2]), pa.date32()),
-            (pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"])), pa.timestamp("ns")),
-            (pd.Series([pd.Timestamp("1970-01-01")]), pa.timestamp("ns")),
-            (pd.Series([pd.Timestamp.min]), pa.timestamp("ns")),
-            (pd.Series([pd.Timestamp.max]), pa.timestamp("ns")),
-            (pd.Series(pd.to_timedelta(["1 day", "2 hours"])), pa.duration("ns")),
-            (pd.Series([pd.Timedelta(0)]), pa.duration("ns")),
-            (pd.Series([pd.Timedelta.min]), pa.duration("ns")),
-            (pd.Series([pd.Timedelta.max]), pa.duration("ns")),
+            (pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"])), pa.timestamp(ts_unit)),
+            (pd.Series([pd.Timestamp("1970-01-01")]), pa.timestamp(ts_unit)),
+            (pd.Series([pd.Timestamp.min]), pa.timestamp(ts_unit)),
+            (pd.Series([pd.Timestamp.max]), pa.timestamp(ts_unit)),
+            (pd.Series(pd.to_timedelta(["1 day", "2 hours"])), pa.duration(ts_unit)),
+            (pd.Series([pd.Timedelta(0)]), pa.duration(ts_unit)),
+            (pd.Series([pd.Timedelta.min]), pa.duration(ts_unit)),
+            (pd.Series([pd.Timedelta.max]), pa.duration(ts_unit)),
             # Timezone-aware
-            (pd.Series([dt1_sg, dt2_sg]), pa.timestamp("ns", tz="Asia/Singapore")),
-            (pd.Series([ts1_la, ts2_la]), pa.timestamp("ns", tz=la)),
+            (pd.Series([dt1_sg, dt2_sg]), pa.timestamp(ts_unit, tz="Asia/Singapore")),
+            (pd.Series([ts1_la, ts2_la]), pa.timestamp(ts_unit, tz=la)),
             # Binary
             (pd.Series([b"hello", b"world"]), pa.binary()),
             # Nested