From 3b69777924d0ac54bc4b6ec9c740cb20774bf033 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 20 Nov 2017 07:13:32 +0000 Subject: [PATCH 1/3] Add document for udf. --- python/pyspark/sql/functions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 087ce7caa89c8..829451191153d 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2205,6 +2205,10 @@ def udf(f=None, returnType=StringType()): rows that do not satisfy the conditions, the suggested workaround is to incorporate the condition logic into the functions. + .. note:: Users can't rely on short-curcuit evaluation of boolean expressions to execute + conditionally user-defined functions too. For example, the two functions in an expression + like udf1(x) && udf2(y) will be both executed on all rows. + :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object From 8efb9c2f132704b0a16f205c470800ebf725c939 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 21 Nov 2017 03:18:46 +0000 Subject: [PATCH 2/3] Revise doc. --- python/pyspark/sql/functions.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 829451191153d..e49b1fb889b42 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2198,16 +2198,9 @@ def udf(f=None, returnType=StringType()): duplicate invocations may be eliminated or the function may even be invoked more times than it is present in the query. - .. note:: The user-defined functions do not support conditional execution by using them with - SQL conditional expressions such as `when` or `if`. The functions still apply on all rows no - matter the conditions are met or not. So the output is correct if the functions can be - correctly run on all rows without failure. 
If the functions can cause runtime failure on the - rows that do not satisfy the conditions, the suggested workaround is to incorporate the - condition logic into the functions. - - .. note:: Users can't rely on short-curcuit evaluation of boolean expressions to execute - conditionally user-defined functions too. For example, the two functions in an expression - like udf1(x) && udf2(y) will be both executed on all rows. + .. note:: The user-defined functions do not support conditional expressions or short-circuiting + in boolean expressions; they end up being executed on all rows internally. If the functions + can fail on special rows, the workaround is to incorporate the condition into the functions. :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object From e6775809d80d110f814615223c5800d94f595195 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 21 Nov 2017 07:59:21 +0000 Subject: [PATCH 3/3] Revise doc for pandas_udf. --- python/pyspark/sql/functions.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e49b1fb889b42..425a3fdf4446a 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2296,12 +2296,9 @@ def pandas_udf(f=None, returnType=StringType()): .. note:: The user-defined function must be deterministic. - .. note:: The user-defined functions do not support conditional execution by using them with - SQL conditional expressions such as `when` or `if`. The functions still apply on all rows no - matter the conditions are met or not. So the output is correct if the functions can be - correctly run on all rows without failure. If the functions can cause runtime failure on the - rows that do not satisfy the conditions, the suggested workaround is to incorporate the - condition logic into the functions. + .. 
note:: The user-defined functions do not support conditional expressions or short-circuiting + in boolean expressions; they end up being executed on all rows internally. If the functions + can fail on special rows, the workaround is to incorporate the condition into the functions. """ return _create_udf(f, returnType=returnType, pythonUdfType=PythonUdfType.PANDAS_UDF)