apache · Yicong-Huang · Mar 26, 2026 · Apr 2, 2026
diff --git a/python/pyspark/errors/error-conditions.json b/python/pyspark/errors/error-conditions.json
@@ -174,8 +174,8 @@
       "`<func_name>` does not allow a Column in a list."
     ]
   },
-  "CONFLICTING_PIPELINE_REFRESH_OPTIONS" : {
-    "message" : [
+  "CONFLICTING_PIPELINE_REFRESH_OPTIONS": {
+    "message": [
       "--full-refresh-all option conflicts with <conflicting_option>",
       "The --full-refresh-all option performs a full refresh of all datasets, ",
       "so specifying individual datasets with <conflicting_option> is not allowed."
@@ -379,6 +379,11 @@
       "<arg_name> index out of range, got '<index>'."
     ]
   },
+  "INPUT_NOT_FULLY_CONSUMED": {
+    "message": [
+      "The input iterator must be fully consumed."
+    ]
+  },
   "INVALID_ARROW_UDTF_RETURN_TYPE": {
     "message": [
       "The return type of the arrow-optimized Python UDTF should be of type 'pandas.DataFrame', but the '<func>' method returned a value of type <return_type> with value: <value>."
@@ -869,6 +874,11 @@
       "<feature> is only supported with Spark Connect; however, the current Spark session does not use Spark Connect."
     ]
   },
+  "OUTPUT_EXCEEDS_INPUT_ROWS": {
+    "message": [
+      "The number of output rows must not exceed the number of input rows."
+    ]
+  },
   "PACKAGE_NOT_INSTALLED": {
     "message": [
       "<package_name> >= <minimum_version> must be installed; however, it was not found."
@@ -881,11 +891,6 @@
       "Alternatively set a pandas-on-spark option 'compute.fail_on_ansi_mode' to `False` to force it to work, although it can cause unexpected behavior."
     ]
   },
-  "PANDAS_UDF_OUTPUT_EXCEEDS_INPUT_ROWS": {
-    "message": [
-      "The Pandas SCALAR_ITER UDF outputs more rows than input rows."
-    ]
-  },
   "PIPELINE_SPEC_DICT_KEY_NOT_STRING": {
     "message": [
       "For pipeline spec field `<field_name>`, key should be a string, got <key_type>."
@@ -982,29 +987,24 @@
       "<error_type> on the server but responses were already received from it."
     ]
   },
-  "RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDF": {
-    "message": [
-      "Column names of the returned pyarrow.Table do not match specified schema.<missing><extra>"
-    ]
-  },
   "RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDTF": {
     "message": [
       "Column names of the returned pyarrow.Table or pyarrow.RecordBatch do not match specified schema. Expected: <expected> Actual: <actual>"
     ]
   },
-  "RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF": {
+  "RESULT_COLUMN_NAMES_MISMATCH": {
     "message": [
-      "Column names of the returned pandas.DataFrame do not match specified schema.<missing><extra>"
+      "Column names of the returned data do not match specified schema.<missing><extra>"
     ]
   },
-  "RESULT_LENGTH_MISMATCH_FOR_PANDAS_UDF": {
+  "RESULT_COLUMN_SCHEMA_MISMATCH": {
     "message": [
-      "Number of columns of the returned pandas.DataFrame doesn't match specified schema. Expected: <expected> Actual: <actual>"
+      "Number of columns of the returned data doesn't match specified schema. Expected: <expected> Actual: <actual>"
     ]
   },
-  "RESULT_LENGTH_MISMATCH_FOR_SCALAR_ITER_PANDAS_UDF": {
+  "RESULT_ROWS_MISMATCH": {
     "message": [
-      "The length of output in Scalar iterator pandas UDF should be the same with the input's; however, the length of output was <output_length> and the length of input was <input_length>."
+      "The number of output rows (<output_length>) must match the number of input rows (<input_length>)."
     ]
   },
   "RESULT_TYPE_MISMATCH_FOR_ARROW_UDF": {
@@ -1037,64 +1037,64 @@
       "Session mutation <method> is not allowed in declarative pipelines."
     ],
     "sub_class": {
-      "SET_RUNTIME_CONF": {
+      "CREATE_GLOBAL_TEMP_VIEW": {
         "message": [
-          "Instead set configuration via the pipeline spec or use the 'spark_conf' argument in various decorators."
+          "Instead use the @temporary_view decorator to define temporary views."
         ]
       },
-      "SET_CURRENT_CATALOG": {
+      "CREATE_OR_REPLACE_GLOBAL_TEMP_VIEW": {
         "message": [
-          "Instead set catalog via the pipeline spec or the 'name' argument on the dataset decorators."
+          "Instead use the @temporary_view decorator to define temporary views."
         ]
       },
-      "SET_CURRENT_DATABASE": {
+      "CREATE_OR_REPLACE_TEMP_VIEW": {
         "message": [
-          "Instead set database via the pipeline spec or the 'name' argument on the dataset decorators."
+          "Instead use the @temporary_view decorator to define temporary views."
         ]
       },
-      "DROP_TEMP_VIEW": {
+      "CREATE_TEMP_VIEW": {
         "message": [
-          "Instead remove the temporary view definition directly."
+          "Instead use the @temporary_view decorator to define temporary views."
         ]
       },
       "DROP_GLOBAL_TEMP_VIEW": {
         "message": [
           "Instead remove the temporary view definition directly."
         ]
       },
-      "CREATE_TEMP_VIEW": {
+      "DROP_TEMP_VIEW": {
         "message": [
-          "Instead use the @temporary_view decorator to define temporary views."
+          "Instead remove the temporary view definition directly."
         ]
       },
-      "CREATE_OR_REPLACE_TEMP_VIEW": {
+      "REGISTER_JAVA_UDAF": {
         "message": [
-          "Instead use the @temporary_view decorator to define temporary views."
+          ""
         ]
       },
-      "CREATE_GLOBAL_TEMP_VIEW": {
+      "REGISTER_JAVA_UDF": {
         "message": [
-          "Instead use the @temporary_view decorator to define temporary views."
+          ""
         ]
       },
-      "CREATE_OR_REPLACE_GLOBAL_TEMP_VIEW": {
+      "REGISTER_UDF": {
         "message": [
-          "Instead use the @temporary_view decorator to define temporary views."
+          ""
         ]
       },
-      "REGISTER_UDF": {
+      "SET_CURRENT_CATALOG": {
         "message": [
-          ""
+          "Instead set catalog via the pipeline spec or the 'name' argument on the dataset decorators."
         ]
       },
-      "REGISTER_JAVA_UDF": {
+      "SET_CURRENT_DATABASE": {
         "message": [
-          ""
+          "Instead set database via the pipeline spec or the 'name' argument on the dataset decorators."
         ]
       },
-      "REGISTER_JAVA_UDAF": {
+      "SET_RUNTIME_CONF": {
         "message": [
-          ""
+          "Instead set configuration via the pipeline spec or use the 'spark_conf' argument in various decorators."
         ]
       }
     }
@@ -1139,33 +1139,28 @@
       "Caught StopIteration thrown from user's code; failing the task: <exc>"
     ]
   },
-  "STOP_ITERATION_OCCURRED_FROM_SCALAR_ITER_PANDAS_UDF": {
-    "message": [
-      "pandas iterator UDF should exhaust the input iterator."
-    ]
-  },
   "STREAMING_CONNECT_SERIALIZATION_ERROR": {
     "message": [
       "Cannot serialize the function `<name>`. If you accessed the Spark session, or a DataFrame defined outside of the function, or any object that contains a Spark session, please be aware that they are not allowed in Spark Connect. For `foreachBatch`, please access the Spark session using `df.sparkSession`, where `df` is the first parameter in your `foreachBatch` function. For `StreamingQueryListener`, please access the Spark session using `self.spark`. For details please check out the PySpark doc for `foreachBatch` and `StreamingQueryListener`."
     ]
   },
-  "ST_INVALID_ALGORITHM_VALUE" : {
-    "message" : [
+  "ST_INVALID_ALGORITHM_VALUE": {
+    "message": [
       "Invalid or unsupported edge interpolation algorithm value: '<alg>'."
     ],
-    "sqlState" : "22023"
+    "sqlState": "22023"
   },
-  "ST_INVALID_CRS_VALUE" : {
-    "message" : [
+  "ST_INVALID_CRS_VALUE": {
+    "message": [
       "Invalid or unsupported CRS (coordinate reference system) value: '<crs>'."
     ],
-    "sqlState" : "22023"
+    "sqlState": "22023"
   },
-  "ST_INVALID_SRID_VALUE" : {
-    "message" : [
+  "ST_INVALID_SRID_VALUE": {
+    "message": [
       "Invalid or unsupported SRID (spatial reference identifier) value: <srid>."
     ],
-    "sqlState" : "22023"
+    "sqlState": "22023"
   },
   "TEST_CLASS_NOT_COMPILED": {
     "message": [

diff --git a/python/pyspark/sql/tests/arrow/test_arrow_cogrouped_map.py b/python/pyspark/sql/tests/arrow/test_arrow_cogrouped_map.py
@@ -195,8 +195,8 @@ def stats(key, left, right):
         with self.quiet():
             with self.assertRaisesRegex(
                 PythonException,
-                "Column names of the returned pyarrow.Table do not match specified schema. "
-                "Missing: m. Unexpected: v, v2.\n",
+                "Column names of the returned data do not match specified schema. "
+                "Missing: m. Unexpected: v, v2.",
             ):
                 # stats returns three columns while here we set schema with two columns
                 self.cogrouped.applyInArrow(stats, schema="id long, m double").collect()
@@ -231,8 +231,7 @@ def odd_means(key, left, _):
         with self.quiet():
             with self.assertRaisesRegex(
                 PythonException,
-                "Column names of the returned pyarrow.Table do not match specified schema. "
-                "Missing: m.\n",
+                "Column names of the returned data do not match specified schema. Missing: m.",
             ):
                 # stats returns one column for even keys while here we set schema with two columns
                 self.cogrouped.applyInArrow(odd_means, schema="id long, m double").collect()

diff --git a/python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py b/python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py
@@ -223,8 +223,8 @@ def stats(key, table):
             for func_variation in function_variations(stats):
                 with self.assertRaisesRegex(
                     PythonException,
-                    "Column names of the returned pyarrow.Table do not match specified schema. "
-                    "Missing: m. Unexpected: v, v2.\n",
+                    "Column names of the returned data do not match specified schema. "
+                    "Missing: m. Unexpected: v, v2.",
                 ):
                     # stats returns three columns while here we set schema with two columns
                     df.groupby("id").applyInArrow(
@@ -264,8 +264,7 @@ def odd_means(key, table):
         with self.quiet():
             with self.assertRaisesRegex(
                 PythonException,
-                "Column names of the returned pyarrow.Table do not match specified schema. "
-                "Missing: m.\n",
+                "Column names of the returned data do not match specified schema. Missing: m.",
             ):
                 # stats returns one column for even keys while here we set schema with two columns
                 df.groupby("id").applyInArrow(odd_means, schema="id long, m double").collect()

diff --git a/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py
@@ -208,8 +208,8 @@ def merge_pandas(lft, rgt):
         self._test_merge_error(
             fn=merge_pandas,
             errorClass=PythonException,
-            error_message_regex="Column names of the returned pandas.DataFrame "
-            "do not match specified schema. Unexpected: add, more.\n",
+            error_message_regex="Column names of the returned data "
+            "do not match specified schema. Unexpected: add, more.",
         )
 
     def test_apply_in_pandas_returning_no_column_names_and_wrong_amount(self):
@@ -229,8 +229,8 @@ def merge_pandas(lft, rgt):
         self._test_merge_error(
             fn=merge_pandas,
             errorClass=PythonException,
-            error_message_regex="Number of columns of the returned pandas.DataFrame "
-            "doesn't match specified schema. Expected: 4 Actual: 6\n",
+            error_message_regex="Number of columns of the returned data "
+            "doesn't match specified schema. Expected: 4 Actual: 6",
         )
 
     def test_apply_in_pandas_returning_empty_dataframe(self):

diff --git a/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py b/python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py
@@ -302,8 +302,8 @@ def test_apply_in_pandas_returning_wrong_column_names(self):
     def check_apply_in_pandas_returning_wrong_column_names(self):
         with self.assertRaisesRegex(
             PythonException,
-            "Column names of the returned pandas.DataFrame do not match specified schema. "
-            "Missing: mean. Unexpected: median, std.\n",
+            "Column names of the returned data do not match specified schema. "
+            "Missing: mean. Unexpected: median, std.",
         ):
             self._test_apply_in_pandas(
                 lambda key, pdf: pd.DataFrame(
@@ -318,8 +318,8 @@ def test_apply_in_pandas_returning_no_column_names_and_wrong_amount(self):
     def check_apply_in_pandas_returning_no_column_names_and_wrong_amount(self):
         with self.assertRaisesRegex(
             PythonException,
-            "Number of columns of the returned pandas.DataFrame doesn't match "
-            "specified schema. Expected: 2 Actual: 3\n",
+            "Number of columns of the returned data doesn't match "
+            "specified schema. Expected: 2 Actual: 3",
         ):
             self._test_apply_in_pandas(
                 lambda key, pdf: pd.DataFrame([key + (pdf.v.mean(), pdf.v.std())])
@@ -664,8 +664,8 @@ def invalid_positional_types(pdf):
         with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}):
             with self.assertRaisesRegex(
                 PythonException,
-                "Column names of the returned pandas.DataFrame do not match "
-                "specified schema. Missing: id. Unexpected: iid.\n",
+                "Column names of the returned data do not match "
+                "specified schema. Missing: id. Unexpected: iid.",
             ):
                 grouped_df.apply(column_name_typo).collect()
             with self.assertRaisesRegex(Exception, "[D|d]ecimal.*got.*date"):

diff --git a/python/pyspark/sql/tests/pandas/test_pandas_map.py b/python/pyspark/sql/tests/pandas/test_pandas_map.py
@@ -200,9 +200,9 @@ def dataframes_with_other_column_names(iterator):
 
         with self.assertRaisesRegex(
             PythonException,
-            "PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
-            "Column names of the returned pandas.DataFrame do not match "
-            "specified schema. Missing: id. Unexpected: iid.\n",
+            "PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
+            "Column names of the returned data do not match "
+            "specified schema. Missing: id. Unexpected: iid.",
         ):
             (
                 self.spark.range(10, numPartitions=3)
@@ -222,9 +222,9 @@ def dataframes_with_other_column_names(iterator):
 
         with self.assertRaisesRegex(
             PythonException,
-            "PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
-            "Column names of the returned pandas.DataFrame do not match "
-            "specified schema. Missing: id2.\n",
+            "PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
+            "Column names of the returned data do not match "
+            "specified schema. Missing: id2.",
         ):
             (
                 self.spark.range(10, numPartitions=3)
@@ -243,18 +243,18 @@ def check_dataframes_with_less_columns(self):
 
         with self.assertRaisesRegex(
             PythonException,
-            "PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
-            "Column names of the returned pandas.DataFrame do not match "
-            "specified schema. Missing: id2.\n",
+            "PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
+            "Column names of the returned data do not match "
+            "specified schema. Missing: id2.",
         ):
             f = self.identity_dataframes_iter("id", "value")
             (df.mapInPandas(f, "id int, id2 long, value int").collect())
 
         with self.assertRaisesRegex(
             PythonException,
-            "PySparkRuntimeError: \\[RESULT_LENGTH_MISMATCH_FOR_PANDAS_UDF\\] "
-            "Number of columns of the returned pandas.DataFrame doesn't match "
-            "specified schema. Expected: 3 Actual: 2\n",
+            "PySparkRuntimeError: \\[RESULT_COLUMN_SCHEMA_MISMATCH\\] "
+            "Number of columns of the returned data doesn't match "
+            "specified schema. Expected: 3 Actual: 2",
         ):
             f = self.identity_dataframes_wo_column_names_iter("id", "value")
             (df.mapInPandas(f, "id int, id2 long, value int").collect())
@@ -361,9 +361,9 @@ def test_empty_dataframes_with_less_columns(self):
     def check_empty_dataframes_with_less_columns(self):
         with self.assertRaisesRegex(
             PythonException,
-            "PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
-            "Column names of the returned pandas.DataFrame do not match "
-            "specified schema. Missing: value.\n",
+            "PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
+            "Column names of the returned data do not match "
+            "specified schema. Missing: value.",
         ):
             f = self.dataframes_and_empty_dataframe_iter("id")
             (
@@ -390,9 +390,9 @@ def empty_dataframes_with_other_columns(iterator):
 
         with self.assertRaisesRegex(
             PythonException,
-            "PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
-            "Column names of the returned pandas.DataFrame do not match "
-            "specified schema. Missing: id. Unexpected: iid.\n",
+            "PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
+            "Column names of the returned data do not match "
+            "specified schema. Missing: id. Unexpected: iid.",
         ):
             (
                 self.spark.range(10, numPartitions=3)