From 1682883004ec3f285b6f85800a775bdc7763a44c Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Mon, 11 May 2026 07:49:50 +0000
Subject: [PATCH] [WIP][PYTHON][TESTS] Patch pandas UDF input-type coercion golden in memory for Pandas 3

In `_compare_or_generate_golden`, when running under Pandas >= 3.0,
replace 'object' with 'str' in the Python Type column for rows whose
Spark Type is `string`. Pandas 3 reports the dedicated `str` dtype for
string columns where earlier versions reported `object`, so the same
golden CSV works under both versions without regenerating.

Generated-by: Claude Code (claude-opus-4-7)
---
 .../sql/tests/coercion/test_pandas_udf_input_type.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/pyspark/sql/tests/coercion/test_pandas_udf_input_type.py b/python/pyspark/sql/tests/coercion/test_pandas_udf_input_type.py
index 64377f2df6981..a77a750e46842 100644
--- a/python/pyspark/sql/tests/coercion/test_pandas_udf_input_type.py
+++ b/python/pyspark/sql/tests/coercion/test_pandas_udf_input_type.py
@@ -251,6 +251,14 @@ def _compare_or_generate_golden(self, golden_file, test_name):
         golden = None
         if not generating:
             golden = self.load_golden_csv(golden_csv)
+            # Pandas >= 3.0 reports the dedicated 'str' dtype for string columns,
+            # whereas earlier versions report 'object'. Patch the in-memory golden
+            # so the same file works under both versions.
+            if LooseVersion(pd.__version__) >= LooseVersion("3.0.0"):
+                str_rows = golden["Spark Type"] == "string"
+                golden.loc[str_rows, "Python Type"] = golden.loc[
+                    str_rows, "Python Type"
+                ].str.replace("'object'", "'str'")
 
         results = []
         for idx, (case_name, spark_type, data_func) in enumerate(self.test_cases):