From afc19f6478f13848920b4279a4ae31a2bc5cd3ba Mon Sep 17 00:00:00 2001
From: Takuya Ueshin <ueshin@databricks.com>
Date: Wed, 18 Mar 2026 14:53:36 -0700
Subject: [PATCH] Handle pandas 3 null string conversion in describe() for
 empty timestamp frames

---
 python/pyspark/pandas/frame.py                | 13 +++++-
 .../pandas/tests/computation/test_describe.py | 43 +++++++++++--------
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index b50bc726cd305..06017473e845e 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -10162,7 +10162,18 @@ def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
             # For timestamp type columns, we should cast the column type to string.
             for key, spark_data_type in zip(column_name_stats_kv, spark_data_types):
                 if isinstance(spark_data_type, (TimestampType, TimestampNTZType)):
-                    column_name_stats_kv[key] = [str(value) for value in column_name_stats_kv[key]]
+                    if LooseVersion(pd.__version__) < "3.0.0":
+                        # Before pandas 3, stringify every value; None becomes "None".
+                        column_name_stats_kv[key] = [
+                            str(value) for value in column_name_stats_kv[key]
+                        ]
+                    else:
+                        # In pandas 3, keep None as-is instead of turning it into "None": pandas
+                        # leaves missing timestamp stats missing after astype(str).
+                        column_name_stats_kv[key] = [
+                            str(value) if value is not None else None
+                            for value in column_name_stats_kv[key]
+                        ]
 
             result: DataFrame = DataFrame(  # type: ignore[no-redef]
                 data=column_name_stats_kv,
diff --git a/python/pyspark/pandas/tests/computation/test_describe.py b/python/pyspark/pandas/tests/computation/test_describe.py
index ad04f72380168..3dbff439a6647 100644
--- a/python/pyspark/pandas/tests/computation/test_describe.py
+++ b/python/pyspark/pandas/tests/computation/test_describe.py
@@ -19,6 +19,7 @@
 import numpy as np
 import pandas as pd
 
+from pyspark.loose_version import LooseVersion
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
@@ -194,25 +195,29 @@ def test_describe_empty(self):
             }
         )
         pdf = psdf._to_pandas()
-        # For timestamp type, we should convert NaT to None in pandas result
-        # since pandas API on Spark doesn't support the NaT for object type.
-        pdf_result = pdf[pdf.a != pdf.a].describe()
-        self.assert_eq(
-            psdf[psdf.a != psdf.a].describe(),
-            pdf_result.where(pdf_result.notnull(), None).astype(str),
-        )
+        if LooseVersion(pd.__version__) < "3.0.0":
+            # For timestamp type, we should convert NaT to None in pandas result
+            # since pandas API on Spark doesn't support the NaT for object type.
+            pdf_result = pdf[pdf.a != pdf.a].describe()
+            pdf_result = pdf_result.where(pdf_result.notnull(), None).astype(str)
+        else:
+            # In pandas 3.0.0+, astype(str) keeps missing timestamp stats missing instead of
+            # stringifying them, and pandas API on Spark's describe() preserves None to match.
+            pdf_result = pdf[pdf.a != pdf.a].describe().astype(str)
+        self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result)
 
         # Explicit empty DataFrame numeric & timestamp
         psdf = ps.DataFrame(
             {"a": [1, 2, 3], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]}
         )
         pdf = psdf._to_pandas()
-        pdf_result = pdf[pdf.a != pdf.a].describe()
-        pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str)
-        self.assert_eq(
-            psdf[psdf.a != psdf.a].describe(),
-            pdf_result,
-        )
+        if LooseVersion(pd.__version__) < "3.0.0":
+            pdf_result = pdf[pdf.a != pdf.a].describe()
+            pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str)
+        else:
+            pdf_result = pdf[pdf.a != pdf.a].describe()
+            pdf_result.b = pdf_result.b.astype(str)
+        self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result)
 
         # Explicit empty DataFrame numeric & string
         psdf = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
@@ -227,11 +232,13 @@ def test_describe_empty(self):
             {"a": ["a", "b", "c"], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]}
         )
         pdf = psdf._to_pandas()
-        pdf_result = pdf[pdf.a != pdf.a].describe()
-        self.assert_eq(
-            psdf[psdf.a != psdf.a].describe(),
-            pdf_result.where(pdf_result.notnull(), None).astype(str),
-        )
+        if LooseVersion(pd.__version__) < "3.0.0":
+            pdf_result = pdf[pdf.a != pdf.a].describe()
+            pdf_result = pdf_result.where(pdf_result.notnull(), None).astype(str)
+        else:
+            pdf_result = pdf[pdf.a != pdf.a].describe()
+            pdf_result.b = pdf_result.b.astype(str)
+        self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result)
 
 
 class FrameDescribeTests(