Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10162,7 +10162,18 @@ def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
# For timestamp type columns, we should cast the column type to string.
for key, spark_data_type in zip(column_name_stats_kv, spark_data_types):
if isinstance(spark_data_type, (TimestampType, TimestampNTZType)):
column_name_stats_kv[key] = [str(value) for value in column_name_stats_kv[key]]
if LooseVersion(pd.__version__) < "3.0.0":
# In pandas < 3.0.0, stringify every value; None becomes the string "None".
column_name_stats_kv[key] = [
str(value) for value in column_name_stats_kv[key]
]
else:
# In pandas 3.0.0+, keep None as None instead of the string "None", so the result
# matches pandas, where empty timestamp describe() stats are missing after astype(str).
column_name_stats_kv[key] = [
str(value) if value is not None else None
for value in column_name_stats_kv[key]
]

result: DataFrame = DataFrame( # type: ignore[no-redef]
data=column_name_stats_kv,
Expand Down
43 changes: 25 additions & 18 deletions python/pyspark/pandas/tests/computation/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import numpy as np
import pandas as pd

from pyspark.loose_version import LooseVersion
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
Expand Down Expand Up @@ -194,25 +195,29 @@ def test_describe_empty(self):
}
)
pdf = psdf._to_pandas()
# For timestamp type, we should convert NaT to None in pandas result
# since pandas API on Spark doesn't support the NaT for object type.
pdf_result = pdf[pdf.a != pdf.a].describe()
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
pdf_result.where(pdf_result.notnull(), None).astype(str),
)
if LooseVersion(pd.__version__) < "3.0.0":
# For timestamp columns, convert NaT to None in the pandas result,
# since pandas API on Spark does not support NaT for object type.
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result = pdf_result.where(pdf_result.notnull(), None).astype(str)
else:
# In pandas 3.0.0+, empty timestamp stats become missing values after astype(str),
# and pandas API on Spark keeps None for them when converting timestamp stats to strings.
pdf_result = pdf[pdf.a != pdf.a].describe().astype(str)
self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result)

# Explicit empty DataFrame numeric & timestamp
psdf = ps.DataFrame(
{"a": [1, 2, 3], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]}
)
pdf = psdf._to_pandas()
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str)
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
pdf_result,
)
if LooseVersion(pd.__version__) < "3.0.0":
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str)
else:
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result.b = pdf_result.b.astype(str)
self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result)

# Explicit empty DataFrame numeric & string
psdf = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
Expand All @@ -227,11 +232,13 @@ def test_describe_empty(self):
{"a": ["a", "b", "c"], "b": [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(1)]}
)
pdf = psdf._to_pandas()
pdf_result = pdf[pdf.a != pdf.a].describe()
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
pdf_result.where(pdf_result.notnull(), None).astype(str),
)
if LooseVersion(pd.__version__) < "3.0.0":
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result = pdf_result.where(pdf_result.notnull(), None).astype(str)
else:
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result.b = pdf_result.b.astype(str)
self.assert_eq(psdf[psdf.a != psdf.a].describe(), pdf_result)


class FrameDescribeTests(
Expand Down