Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 49 additions & 54 deletions python/pyspark/errors/error-conditions.json
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@
"`<func_name>` does not allow a Column in a list."
]
},
"CONFLICTING_PIPELINE_REFRESH_OPTIONS" : {
"message" : [
"CONFLICTING_PIPELINE_REFRESH_OPTIONS": {
"message": [
"--full-refresh-all option conflicts with <conflicting_option>",
"The --full-refresh-all option performs a full refresh of all datasets, ",
"so specifying individual datasets with <conflicting_option> is not allowed."
Expand Down Expand Up @@ -379,6 +379,11 @@
"<arg_name> index out of range, got '<index>'."
]
},
"INPUT_NOT_FULLY_CONSUMED": {
"message": [
"The input iterator must be fully consumed."
]
},
"INVALID_ARROW_UDTF_RETURN_TYPE": {
"message": [
"The return type of the arrow-optimized Python UDTF should be of type 'pandas.DataFrame', but the '<func>' method returned a value of type <return_type> with value: <value>."
Expand Down Expand Up @@ -869,6 +874,11 @@
"<feature> is only supported with Spark Connect; however, the current Spark session does not use Spark Connect."
]
},
"OUTPUT_EXCEEDS_INPUT_ROWS": {
"message": [
"The number of output rows must not exceed the number of input rows."
]
},
"PACKAGE_NOT_INSTALLED": {
"message": [
"<package_name> >= <minimum_version> must be installed; however, it was not found."
Expand All @@ -881,11 +891,6 @@
"Alternatively set a pandas-on-spark option 'compute.fail_on_ansi_mode' to `False` to force it to work, although it can cause unexpected behavior."
]
},
"PANDAS_UDF_OUTPUT_EXCEEDS_INPUT_ROWS": {
"message": [
"The Pandas SCALAR_ITER UDF outputs more rows than input rows."
]
},
"PIPELINE_SPEC_DICT_KEY_NOT_STRING": {
"message": [
"For pipeline spec field `<field_name>`, key should be a string, got <key_type>."
Expand Down Expand Up @@ -982,29 +987,24 @@
"<error_type> on the server but responses were already received from it."
]
},
"RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDF": {
"message": [
"Column names of the returned pyarrow.Table do not match specified schema.<missing><extra>"
]
},
"RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDTF": {
"message": [
"Column names of the returned pyarrow.Table or pyarrow.RecordBatch do not match specified schema. Expected: <expected> Actual: <actual>"
]
},
"RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF": {
"RESULT_COLUMN_NAMES_MISMATCH": {
"message": [
"Column names of the returned pandas.DataFrame do not match specified schema.<missing><extra>"
"Column names of the returned data do not match specified schema.<missing><extra>"
]
},
"RESULT_LENGTH_MISMATCH_FOR_PANDAS_UDF": {
"RESULT_COLUMN_SCHEMA_MISMATCH": {
"message": [
"Number of columns of the returned pandas.DataFrame doesn't match specified schema. Expected: <expected> Actual: <actual>"
"Number of columns of the returned data doesn't match specified schema. Expected: <expected> Actual: <actual>"
]
},
"RESULT_LENGTH_MISMATCH_FOR_SCALAR_ITER_PANDAS_UDF": {
"RESULT_ROWS_MISMATCH": {
"message": [
"The length of output in Scalar iterator pandas UDF should be the same with the input's; however, the length of output was <output_length> and the length of input was <input_length>."
"The number of output rows (<output_length>) must match the number of input rows (<input_length>)."
]
},
"RESULT_TYPE_MISMATCH_FOR_ARROW_UDF": {
Expand Down Expand Up @@ -1037,64 +1037,64 @@
"Session mutation <method> is not allowed in declarative pipelines."
],
"sub_class": {
"SET_RUNTIME_CONF": {
"CREATE_GLOBAL_TEMP_VIEW": {
"message": [
"Instead set configuration via the pipeline spec or use the 'spark_conf' argument in various decorators."
"Instead use the @temporary_view decorator to define temporary views."
]
},
"SET_CURRENT_CATALOG": {
"CREATE_OR_REPLACE_GLOBAL_TEMP_VIEW": {
"message": [
"Instead set catalog via the pipeline spec or the 'name' argument on the dataset decorators."
"Instead use the @temporary_view decorator to define temporary views."
]
},
"SET_CURRENT_DATABASE": {
"CREATE_OR_REPLACE_TEMP_VIEW": {
"message": [
"Instead set database via the pipeline spec or the 'name' argument on the dataset decorators."
"Instead use the @temporary_view decorator to define temporary views."
]
},
"DROP_TEMP_VIEW": {
"CREATE_TEMP_VIEW": {
"message": [
"Instead remove the temporary view definition directly."
"Instead use the @temporary_view decorator to define temporary views."
]
},
"DROP_GLOBAL_TEMP_VIEW": {
"message": [
"Instead remove the temporary view definition directly."
]
},
"CREATE_TEMP_VIEW": {
"DROP_TEMP_VIEW": {
"message": [
"Instead use the @temporary_view decorator to define temporary views."
"Instead remove the temporary view definition directly."
]
},
"CREATE_OR_REPLACE_TEMP_VIEW": {
"REGISTER_JAVA_UDAF": {
"message": [
"Instead use the @temporary_view decorator to define temporary views."
""
]
},
"CREATE_GLOBAL_TEMP_VIEW": {
"REGISTER_JAVA_UDF": {
"message": [
"Instead use the @temporary_view decorator to define temporary views."
""
]
},
"CREATE_OR_REPLACE_GLOBAL_TEMP_VIEW": {
"REGISTER_UDF": {
"message": [
"Instead use the @temporary_view decorator to define temporary views."
""
]
},
"REGISTER_UDF": {
"SET_CURRENT_CATALOG": {
"message": [
""
"Instead set catalog via the pipeline spec or the 'name' argument on the dataset decorators."
]
},
"REGISTER_JAVA_UDF": {
"SET_CURRENT_DATABASE": {
"message": [
""
"Instead set database via the pipeline spec or the 'name' argument on the dataset decorators."
]
},
"REGISTER_JAVA_UDAF": {
"SET_RUNTIME_CONF": {
"message": [
""
"Instead set configuration via the pipeline spec or use the 'spark_conf' argument in various decorators."
]
}
}
Expand Down Expand Up @@ -1139,33 +1139,28 @@
"Caught StopIteration thrown from user's code; failing the task: <exc>"
]
},
"STOP_ITERATION_OCCURRED_FROM_SCALAR_ITER_PANDAS_UDF": {
"message": [
"pandas iterator UDF should exhaust the input iterator."
]
},
"STREAMING_CONNECT_SERIALIZATION_ERROR": {
"message": [
"Cannot serialize the function `<name>`. If you accessed the Spark session, or a DataFrame defined outside of the function, or any object that contains a Spark session, please be aware that they are not allowed in Spark Connect. For `foreachBatch`, please access the Spark session using `df.sparkSession`, where `df` is the first parameter in your `foreachBatch` function. For `StreamingQueryListener`, please access the Spark session using `self.spark`. For details please check out the PySpark doc for `foreachBatch` and `StreamingQueryListener`."
]
},
"ST_INVALID_ALGORITHM_VALUE" : {
"message" : [
"ST_INVALID_ALGORITHM_VALUE": {
"message": [
"Invalid or unsupported edge interpolation algorithm value: '<alg>'."
],
"sqlState" : "22023"
"sqlState": "22023"
},
"ST_INVALID_CRS_VALUE" : {
"message" : [
"ST_INVALID_CRS_VALUE": {
"message": [
"Invalid or unsupported CRS (coordinate reference system) value: '<crs>'."
],
"sqlState" : "22023"
"sqlState": "22023"
},
"ST_INVALID_SRID_VALUE" : {
"message" : [
"ST_INVALID_SRID_VALUE": {
"message": [
"Invalid or unsupported SRID (spatial reference identifier) value: <srid>."
],
"sqlState" : "22023"
"sqlState": "22023"
},
"TEST_CLASS_NOT_COMPILED": {
"message": [
Expand Down
7 changes: 3 additions & 4 deletions python/pyspark/sql/tests/arrow/test_arrow_cogrouped_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ def stats(key, left, right):
with self.quiet():
with self.assertRaisesRegex(
PythonException,
"Column names of the returned pyarrow.Table do not match specified schema. "
"Missing: m. Unexpected: v, v2.\n",
"Column names of the returned data do not match specified schema. "
"Missing: m. Unexpected: v, v2.",
):
# stats returns three columns while here we set schema with two columns
self.cogrouped.applyInArrow(stats, schema="id long, m double").collect()
Expand Down Expand Up @@ -231,8 +231,7 @@ def odd_means(key, left, _):
with self.quiet():
with self.assertRaisesRegex(
PythonException,
"Column names of the returned pyarrow.Table do not match specified schema. "
"Missing: m.\n",
"Column names of the returned data do not match specified schema. Missing: m.",
):
# stats returns one column for even keys while here we set schema with two columns
self.cogrouped.applyInArrow(odd_means, schema="id long, m double").collect()
Expand Down
7 changes: 3 additions & 4 deletions python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,8 @@ def stats(key, table):
for func_variation in function_variations(stats):
with self.assertRaisesRegex(
PythonException,
"Column names of the returned pyarrow.Table do not match specified schema. "
"Missing: m. Unexpected: v, v2.\n",
"Column names of the returned data do not match specified schema. "
"Missing: m. Unexpected: v, v2.",
):
# stats returns three columns while here we set schema with two columns
df.groupby("id").applyInArrow(
Expand Down Expand Up @@ -264,8 +264,7 @@ def odd_means(key, table):
with self.quiet():
with self.assertRaisesRegex(
PythonException,
"Column names of the returned pyarrow.Table do not match specified schema. "
"Missing: m.\n",
"Column names of the returned data do not match specified schema. Missing: m.",
):
# stats returns one column for even keys while here we set schema with two columns
df.groupby("id").applyInArrow(odd_means, schema="id long, m double").collect()
Expand Down
8 changes: 4 additions & 4 deletions python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ def merge_pandas(lft, rgt):
self._test_merge_error(
fn=merge_pandas,
errorClass=PythonException,
error_message_regex="Column names of the returned pandas.DataFrame "
"do not match specified schema. Unexpected: add, more.\n",
error_message_regex="Column names of the returned data "
"do not match specified schema. Unexpected: add, more.",
)

def test_apply_in_pandas_returning_no_column_names_and_wrong_amount(self):
Expand All @@ -229,8 +229,8 @@ def merge_pandas(lft, rgt):
self._test_merge_error(
fn=merge_pandas,
errorClass=PythonException,
error_message_regex="Number of columns of the returned pandas.DataFrame "
"doesn't match specified schema. Expected: 4 Actual: 6\n",
error_message_regex="Number of columns of the returned data "
"doesn't match specified schema. Expected: 4 Actual: 6",
)

def test_apply_in_pandas_returning_empty_dataframe(self):
Expand Down
12 changes: 6 additions & 6 deletions python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,8 @@ def test_apply_in_pandas_returning_wrong_column_names(self):
def check_apply_in_pandas_returning_wrong_column_names(self):
with self.assertRaisesRegex(
PythonException,
"Column names of the returned pandas.DataFrame do not match specified schema. "
"Missing: mean. Unexpected: median, std.\n",
"Column names of the returned data do not match specified schema. "
"Missing: mean. Unexpected: median, std.",
):
self._test_apply_in_pandas(
lambda key, pdf: pd.DataFrame(
Expand All @@ -318,8 +318,8 @@ def test_apply_in_pandas_returning_no_column_names_and_wrong_amount(self):
def check_apply_in_pandas_returning_no_column_names_and_wrong_amount(self):
with self.assertRaisesRegex(
PythonException,
"Number of columns of the returned pandas.DataFrame doesn't match "
"specified schema. Expected: 2 Actual: 3\n",
"Number of columns of the returned data doesn't match "
"specified schema. Expected: 2 Actual: 3",
):
self._test_apply_in_pandas(
lambda key, pdf: pd.DataFrame([key + (pdf.v.mean(), pdf.v.std())])
Expand Down Expand Up @@ -664,8 +664,8 @@ def invalid_positional_types(pdf):
with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}):
with self.assertRaisesRegex(
PythonException,
"Column names of the returned pandas.DataFrame do not match "
"specified schema. Missing: id. Unexpected: iid.\n",
"Column names of the returned data do not match "
"specified schema. Missing: id. Unexpected: iid.",
):
grouped_df.apply(column_name_typo).collect()
with self.assertRaisesRegex(Exception, "[D|d]ecimal.*got.*date"):
Expand Down
36 changes: 18 additions & 18 deletions python/pyspark/sql/tests/pandas/test_pandas_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,9 @@ def dataframes_with_other_column_names(iterator):

with self.assertRaisesRegex(
PythonException,
"PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
"Column names of the returned pandas.DataFrame do not match "
"specified schema. Missing: id. Unexpected: iid.\n",
"PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
"Column names of the returned data do not match "
"specified schema. Missing: id. Unexpected: iid.",
):
(
self.spark.range(10, numPartitions=3)
Expand All @@ -222,9 +222,9 @@ def dataframes_with_other_column_names(iterator):

with self.assertRaisesRegex(
PythonException,
"PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
"Column names of the returned pandas.DataFrame do not match "
"specified schema. Missing: id2.\n",
"PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
"Column names of the returned data do not match "
"specified schema. Missing: id2.",
):
(
self.spark.range(10, numPartitions=3)
Expand All @@ -243,18 +243,18 @@ def check_dataframes_with_less_columns(self):

with self.assertRaisesRegex(
PythonException,
"PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
"Column names of the returned pandas.DataFrame do not match "
"specified schema. Missing: id2.\n",
"PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
"Column names of the returned data do not match "
"specified schema. Missing: id2.",
):
f = self.identity_dataframes_iter("id", "value")
(df.mapInPandas(f, "id int, id2 long, value int").collect())

with self.assertRaisesRegex(
PythonException,
"PySparkRuntimeError: \\[RESULT_LENGTH_MISMATCH_FOR_PANDAS_UDF\\] "
"Number of columns of the returned pandas.DataFrame doesn't match "
"specified schema. Expected: 3 Actual: 2\n",
"PySparkRuntimeError: \\[RESULT_COLUMN_SCHEMA_MISMATCH\\] "
"Number of columns of the returned data doesn't match "
"specified schema. Expected: 3 Actual: 2",
):
f = self.identity_dataframes_wo_column_names_iter("id", "value")
(df.mapInPandas(f, "id int, id2 long, value int").collect())
Expand Down Expand Up @@ -361,9 +361,9 @@ def test_empty_dataframes_with_less_columns(self):
def check_empty_dataframes_with_less_columns(self):
with self.assertRaisesRegex(
PythonException,
"PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
"Column names of the returned pandas.DataFrame do not match "
"specified schema. Missing: value.\n",
"PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
"Column names of the returned data do not match "
"specified schema. Missing: value.",
):
f = self.dataframes_and_empty_dataframe_iter("id")
(
Expand All @@ -390,9 +390,9 @@ def empty_dataframes_with_other_columns(iterator):

with self.assertRaisesRegex(
PythonException,
"PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF\\] "
"Column names of the returned pandas.DataFrame do not match "
"specified schema. Missing: id. Unexpected: iid.\n",
"PySparkRuntimeError: \\[RESULT_COLUMN_NAMES_MISMATCH\\] "
"Column names of the returned data do not match "
"specified schema. Missing: id. Unexpected: iid.",
):
(
self.spark.range(10, numPartitions=3)
Expand Down
Loading