From afc19f6478f13848920b4279a4ae31a2bc5cd3ba Mon Sep 17 00:00:00 2001 From: Takuya Ueshin Date: Wed, 18 Mar 2026 14:53:36 -0700 Subject: [PATCH] Handle pandas 3 null string conversion in describe() for empty timestamp frames --- python/pyspark/pandas/frame.py | 13 +++++- .../pandas/tests/computation/test_describe.py | 43 +++++++++++-------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index b50bc726cd305..06017473e845e 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -10162,7 +10162,18 @@ def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame": # For timestamp type columns, we should cast the column type to string. for key, spark_data_type in zip(column_name_stats_kv, spark_data_types): if isinstance(spark_data_type, (TimestampType, TimestampNTZType)): - column_name_stats_kv[key] = [str(value) for value in column_name_stats_kv[key]] + if LooseVersion(pd.__version__) < "3.0.0": + # In pandas 2, use str(value) for all values, including None + column_name_stats_kv[key] = [ + str(value) for value in column_name_stats_kv[key] + ] + else: + # In pandas 3, preserve None to match empty timestamp describe() results + # after string conversion in pandas-based expectations. 
+ column_name_stats_kv[key] = [ + str(value) if value is not None else None + for value in column_name_stats_kv[key] + ] result: DataFrame = DataFrame( # type: ignore[no-redef] data=column_name_stats_kv, diff --git a/python/pyspark/pandas/tests/computation/test_describe.py b/python/pyspark/pandas/tests/computation/test_describe.py index ad04f72380168..3dbff439a6647 100644 --- a/python/pyspark/pandas/tests/computation/test_describe.py +++ b/python/pyspark/pandas/tests/computation/test_describe.py @@ -19,6 +19,7 @@ import numpy as np import pandas as pd +from pyspark.loose_version import LooseVersion from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils @@ -194,25 +195,29 @@ def test_describe_empty(self): } ) pdf = psdf._to_pandas() - # For timestamp type, we should convert NaT to None in pandas result - # since pandas API on Spark doesn't support the NaT for object type. - pdf_result = pdf[pdf.a != pdf.a].describe() - self.assert_eq( - psdf[psdf.a != psdf.a].describe(), - pdf_result.where(pdf_result.notnull(), None).astype(str), - ) + if LooseVersion(pd.__version__) < "3.0.0": + # For timestamp type, we should convert NaT to None in pandas result + # since pandas API on Spark doesn't support NaT for object type. + pdf_result = pdf[pdf.a != pdf.a].describe() + pdf_result = pdf_result.where(pdf_result.notnull(), None).astype(str) + else: + # In pandas 3.0.0+, astype(str) keeps missing values missing instead of turning + # them into strings, so the empty timestamp stats need no NaT-to-None conversion. 
+ pdf_result = pdf[pdf.a != pdf.a].describe().astype(str) + self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result) # Explicit empty DataFrame numeric & timestamp psdf = ps.DataFrame( {"a": [1, 2, 3], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]} ) pdf = psdf._to_pandas() - pdf_result = pdf[pdf.a != pdf.a].describe() - pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str) - self.assert_eq( - psdf[psdf.a != psdf.a].describe(), - pdf_result, - ) + if LooseVersion(pd.__version__) < "3.0.0": + pdf_result = pdf[pdf.a != pdf.a].describe() + pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str) + else: + pdf_result = pdf[pdf.a != pdf.a].describe() + pdf_result.b = pdf_result.b.astype(str) + self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result) # Explicit empty DataFrame numeric & string psdf = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) @@ -227,11 +232,13 @@ def test_describe_empty(self): {"a": ["a", "b", "c"], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]} ) pdf = psdf._to_pandas() - pdf_result = pdf[pdf.a != pdf.a].describe() - self.assert_eq( - psdf[psdf.a != psdf.a].describe(), - pdf_result.where(pdf_result.notnull(), None).astype(str), - ) + if LooseVersion(pd.__version__) < "3.0.0": + pdf_result = pdf[pdf.a != pdf.a].describe() + pdf_result = pdf_result.where(pdf_result.notnull(), None).astype(str) + else: + pdf_result = pdf[pdf.a != pdf.a].describe() + pdf_result.b = pdf_result.b.astype(str) + self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result) class FrameDescribeTests(