From fe3e34dda68fd54212df1dd01b8acb9a9bc6a0ad Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 9 Jul 2019 10:50:07 +0800 Subject: [PATCH] [SPARK-28273][SQL][PYTHON] Convert and port 'pgSQL/case.sql' into UDF test base ## What changes were proposed in this pull request? This PR adds some tests converted from `pgSQL/case.sql` to test UDFs. Please see the contribution guide in this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). This PR also contains two minor fixes: 1. Change the name of Scala UDFs from `UDF:name(...)` to `name(...)` to be consistent with Python. 2. Fix the Scala UDF at `IntegratedUDFTestUtils.scala` to handle `null` in strings.
Diff compared to 'pgSQL/case.sql':

```diff diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out index fa078d16d6d..55bef64338f 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out -115,7 +115,7 struct<> -- !query 13 SELECT '3' AS `One`, CASE - WHEN 1 < 2 THEN 3 + WHEN CAST(udf(1 < 2) AS boolean) THEN 3 END AS `Simple WHEN` -- !query 13 schema struct -126,10 +126,10 struct -- !query 14 SELECT '' AS `One`, CASE - WHEN 1 > 2 THEN 3 + WHEN 1 > 2 THEN udf(3) END AS `Simple default` -- !query 14 schema -struct +struct -- !query 14 output NULL -137,17 +137,17 struct -- !query 15 SELECT '3' AS `One`, CASE - WHEN 1 < 2 THEN 3 - ELSE 4 + WHEN udf(1) < 2 THEN udf(3) + ELSE udf(4) END AS `Simple ELSE` -- !query 15 schema -struct +struct -- !query 15 output 3 3 -- !query 16 -SELECT '4' AS `One`, +SELECT udf('4') AS `One`, CASE WHEN 1 > 2 THEN 3 ELSE 4 -159,10 +159,10 struct -- !query 17 -SELECT '6' AS `One`, +SELECT udf('6') AS `One`, CASE - WHEN 1 > 2 THEN 3 - WHEN 4 < 5 THEN 6 + WHEN CAST(udf(1 > 2) AS boolean) THEN 3 + WHEN udf(4) < 5 THEN 6 ELSE 7 END AS `Two WHEN with default` -- !query 17 schema -173,7 +173,7 struct -- !query 18 SELECT '7' AS `None`, - CASE WHEN rand() < 0 THEN 1 + CASE WHEN rand() < udf(0) THEN 1 END AS `NULL on no matches` -- !query 18 schema struct -182,36 +182,36 struct -- !query 19 -SELECT CASE WHEN 1=0 THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END +SELECT CASE WHEN CAST(udf(1=0) AS boolean) THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END -- !query 19 schema -struct +struct -- !query 19 output 1.0 -- !query 20 -SELECT CASE 1 WHEN 0 THEN 1/0 WHEN 1 THEN 1 ELSE 2/0 END +SELECT CASE 1 WHEN 0 THEN 1/udf(0) WHEN 1 THEN 1 ELSE 2/0 END -- !query 20 schema -struct +struct -- !query 20 output 1.0 -- !query 21 -SELECT CASE WHEN i > 100 THEN 1/0 ELSE 0 END FROM case_tbl +SELECT CASE WHEN i > 100 THEN udf(1/0) 
ELSE udf(0) END FROM case_tbl -- !query 21 schema -struct 100) THEN (CAST(1 AS DOUBLE) / CAST(0 AS DOUBLE)) ELSE CAST(0 AS DOUBLE) END:double> +struct 100) THEN udf((cast(1 as double) / cast(0 as double))) ELSE udf(0) END:string> -- !query 21 output -0.0 -0.0 -0.0 -0.0 +0 +0 +0 +0 -- !query 22 -SELECT CASE 'a' WHEN 'a' THEN 1 ELSE 2 END +SELECT CASE 'a' WHEN 'a' THEN udf(1) ELSE udf(2) END -- !query 22 schema -struct +struct -- !query 22 output 1 -283,7 +283,7 big -- !query 27 -SELECT * FROM CASE_TBL WHERE COALESCE(f,i) = 4 +SELECT * FROM CASE_TBL WHERE udf(COALESCE(f,i)) = 4 -- !query 27 schema struct -- !query 27 output -291,7 +291,7 struct -- !query 28 -SELECT * FROM CASE_TBL WHERE NULLIF(f,i) = 2 +SELECT * FROM CASE_TBL WHERE udf(NULLIF(f,i)) = 2 -- !query 28 schema struct -- !query 28 output -299,10 +299,10 struct -- !query 29 -SELECT COALESCE(a.f, b.i, b.j) +SELECT udf(COALESCE(a.f, b.i, b.j)) FROM CASE_TBL a, CASE2_TBL b -- !query 29 schema -struct +struct -- !query 29 output -30.3 -30.3 -332,8 +332,8 struct -- !query 30 SELECT * - FROM CASE_TBL a, CASE2_TBL b - WHERE COALESCE(a.f, b.i, b.j) = 2 + FROM CASE_TBL a, CASE2_TBL b + WHERE udf(COALESCE(a.f, b.i, b.j)) = 2 -- !query 30 schema struct -- !query 30 output -342,7 +342,7 struct -- !query 31 -SELECT '' AS Five, NULLIF(a.i,b.i) AS `NULLIF(a.i,b.i)`, +SELECT udf('') AS Five, NULLIF(a.i,b.i) AS `NULLIF(a.i,b.i)`, NULLIF(b.i, 4) AS `NULLIF(b.i,4)` FROM CASE_TBL a, CASE2_TBL b -- !query 31 schema -377,7 +377,7 struct -- !query 32 SELECT '' AS `Two`, * FROM CASE_TBL a, CASE2_TBL b - WHERE COALESCE(f,b.i) = 2 + WHERE CAST(udf(COALESCE(f,b.i) = 2) AS boolean) -- !query 32 schema struct -- !query 32 output -388,15 +388,15 struct -- !query 33 SELECT CASE (CASE vol('bar') - WHEN 'foo' THEN 'it was foo!' - WHEN vol(null) THEN 'null input' + WHEN udf('foo') THEN 'it was foo!' + WHEN udf(vol(null)) THEN 'null input' WHEN 'bar' THEN 'it was bar!' END ) - WHEN 'it was foo!' THEN 'foo recognized' - WHEN 'it was bar!' 
THEN 'bar recognized' - ELSE 'unrecognized' END + WHEN udf('it was foo!') THEN 'foo recognized' + WHEN 'it was bar!' THEN udf('bar recognized') + ELSE 'unrecognized' END AS col -- !query 33 schema -struct +struct -- !query 33 output bar recognized ```

https://github.com/apache/spark/pull/25069 contains the same minor fixes as it's required to write the tests. ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes #25070 from HyukjinKwon/SPARK-28273. Authored-by: HyukjinKwon Signed-off-by: Wenchen Fan --- python/pyspark/sql/udf.py | 6 +- .../sql/catalyst/expressions/ScalaUDF.scala | 3 +- .../sql-tests/inputs/udf/pgSQL/udf-case.sql | 272 +++++++++++ .../sql-tests/results/pgSQL/case.sql.out | 2 +- .../results/udf/pgSQL/udf-case.sql.out | 425 ++++++++++++++++++ .../spark/sql/IntegratedUDFTestUtils.scala | 5 +- .../apache/spark/sql/SQLQueryTestSuite.scala | 7 +- .../scala/org/apache/spark/sql/UDFSuite.scala | 6 +- 8 files changed, 713 insertions(+), 13 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 0944c874fa275..188ec2634974a 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -376,17 +376,17 @@ def registerJavaFunction(self, name, javaClassName, returnType=None): >>> spark.udf.registerJavaFunction( ... "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType()) >>> spark.sql("SELECT javaStringLength('test')").collect() - [Row(UDF:javaStringLength(test)=4)] + [Row(javaStringLength(test)=4)] >>> spark.udf.registerJavaFunction( ... "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength") >>> spark.sql("SELECT javaStringLength2('test')").collect() - [Row(UDF:javaStringLength2(test)=4)] + [Row(javaStringLength2(test)=4)] >>> spark.udf.registerJavaFunction( ... 
"javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer") >>> spark.sql("SELECT javaStringLength3('test')").collect() - [Row(UDF:javaStringLength3(test)=4)] + [Row(javaStringLength3(test)=4)] """ jdt = None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 3274b66e98481..10f8ec9617d1b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -57,8 +57,7 @@ case class ScalaUDF( override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) - override def toString: String = - s"${udfName.map(name => s"UDF:$name").getOrElse("UDF")}(${children.mkString(", ")})" + override def toString: String = s"${udfName.getOrElse("UDF")}(${children.mkString(", ")})" // scalastyle:off line.size.limit diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql new file mode 100644 index 0000000000000..5ec4cb1d6326f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql @@ -0,0 +1,272 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- CASE +-- https://github.com/postgres/postgres/blob/REL_12_BETA1/src/test/regress/sql/case.sql +-- Test the CASE statement +-- +-- This test suite contains two Cartesian products without using explicit CROSS JOIN syntax. +-- Thus, we set spark.sql.crossJoin.enabled to true. + +-- This test file was converted from pgSQL/case.sql. +-- Note that currently registered UDF returns a string. So there are some differences, for instance +-- in string cast within UDF in Scala and Python. 
+ +set spark.sql.crossJoin.enabled=true; +CREATE TABLE CASE_TBL ( + i integer, + f double +) USING parquet; + +CREATE TABLE CASE2_TBL ( + i integer, + j integer +) USING parquet; + +INSERT INTO CASE_TBL VALUES (1, 10.1); +INSERT INTO CASE_TBL VALUES (2, 20.2); +INSERT INTO CASE_TBL VALUES (3, -30.3); +INSERT INTO CASE_TBL VALUES (4, NULL); + +INSERT INTO CASE2_TBL VALUES (1, -1); +INSERT INTO CASE2_TBL VALUES (2, -2); +INSERT INTO CASE2_TBL VALUES (3, -3); +INSERT INTO CASE2_TBL VALUES (2, -4); +INSERT INTO CASE2_TBL VALUES (1, NULL); +INSERT INTO CASE2_TBL VALUES (NULL, -6); + +-- +-- Simplest examples without tables +-- + +SELECT '3' AS `One`, + CASE + WHEN CAST(udf(1 < 2) AS boolean) THEN 3 + END AS `Simple WHEN`; + +SELECT '' AS `One`, + CASE + WHEN 1 > 2 THEN udf(3) + END AS `Simple default`; + +SELECT '3' AS `One`, + CASE + WHEN udf(1) < 2 THEN udf(3) + ELSE udf(4) + END AS `Simple ELSE`; + +SELECT udf('4') AS `One`, + CASE + WHEN 1 > 2 THEN 3 + ELSE 4 + END AS `ELSE default`; + +SELECT udf('6') AS `One`, + CASE + WHEN CAST(udf(1 > 2) AS boolean) THEN 3 + WHEN udf(4) < 5 THEN 6 + ELSE 7 + END AS `Two WHEN with default`; + +SELECT '7' AS `None`, + CASE WHEN rand() < udf(0) THEN 1 + END AS `NULL on no matches`; + +-- Constant-expression folding shouldn't evaluate unreachable subexpressions +SELECT CASE WHEN CAST(udf(1=0) AS boolean) THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END; +SELECT CASE 1 WHEN 0 THEN 1/udf(0) WHEN 1 THEN 1 ELSE 2/0 END; + +-- [SPARK-27923] PostgreSQL throws an exception but Spark SQL is NULL +-- However we do not currently suppress folding of potentially +-- reachable subexpressions +SELECT CASE WHEN i > 100 THEN udf(1/0) ELSE udf(0) END FROM case_tbl; + +-- Test for cases involving untyped literals in test expression +SELECT CASE 'a' WHEN 'a' THEN udf(1) ELSE udf(2) END; + +-- +-- Examples of targets involving tables +-- + +SELECT '' AS `Five`, + CASE + WHEN i >= 3 THEN i + END AS `>= 3 or Null` + FROM CASE_TBL; + +SELECT '' AS `Five`, + CASE 
WHEN i >= 3 THEN (i + i) + ELSE i + END AS `Simplest Math` + FROM CASE_TBL; + +SELECT '' AS `Five`, i AS `Value`, + CASE WHEN (i < 0) THEN 'small' + WHEN (i = 0) THEN 'zero' + WHEN (i = 1) THEN 'one' + WHEN (i = 2) THEN 'two' + ELSE 'big' + END AS `Category` + FROM CASE_TBL; + +SELECT '' AS `Five`, + CASE WHEN ((i < 0) or (i < 0)) THEN 'small' + WHEN ((i = 0) or (i = 0)) THEN 'zero' + WHEN ((i = 1) or (i = 1)) THEN 'one' + WHEN ((i = 2) or (i = 2)) THEN 'two' + ELSE 'big' + END AS `Category` + FROM CASE_TBL; + +-- +-- Examples of qualifications involving tables +-- + +-- +-- NULLIF() and COALESCE() +-- Shorthand forms for typical CASE constructs +-- defined in the SQL standard. +-- + +SELECT * FROM CASE_TBL WHERE udf(COALESCE(f,i)) = 4; + +SELECT * FROM CASE_TBL WHERE udf(NULLIF(f,i)) = 2; + +SELECT udf(COALESCE(a.f, b.i, b.j)) + FROM CASE_TBL a, CASE2_TBL b; + +SELECT * + FROM CASE_TBL a, CASE2_TBL b + WHERE udf(COALESCE(a.f, b.i, b.j)) = 2; + +SELECT udf('') AS Five, NULLIF(a.i,b.i) AS `NULLIF(a.i,b.i)`, + NULLIF(b.i, 4) AS `NULLIF(b.i,4)` + FROM CASE_TBL a, CASE2_TBL b; + +SELECT '' AS `Two`, * + FROM CASE_TBL a, CASE2_TBL b + WHERE CAST(udf(COALESCE(f,b.i) = 2) AS boolean); + +-- We don't support update now. +-- +-- Examples of updates involving tables +-- + +-- UPDATE CASE_TBL +-- SET i = CASE WHEN i >= 3 THEN (- i) +-- ELSE (2 * i) END; + +-- SELECT * FROM CASE_TBL; + +-- UPDATE CASE_TBL +-- SET i = CASE WHEN i >= 2 THEN (2 * i) +-- ELSE (3 * i) END; + +-- SELECT * FROM CASE_TBL; + +-- UPDATE CASE_TBL +-- SET i = CASE WHEN b.i >= 2 THEN (2 * j) +-- ELSE (3 * j) END +-- FROM CASE2_TBL b +-- WHERE j = -CASE_TBL.i; + +-- SELECT * FROM CASE_TBL; + +-- +-- Nested CASE expressions +-- + +-- This test exercises a bug caused by aliasing econtext->caseValue_isNull +-- with the isNull argument of the inner CASE's CaseExpr evaluation. 
After +-- evaluating the vol(null) expression in the inner CASE's second WHEN-clause, +-- the isNull flag for the case test value incorrectly became true, causing +-- the third WHEN-clause not to match. The volatile function calls are needed +-- to prevent constant-folding in the planner, which would hide the bug. + +-- Wrap this in a single transaction so the transient '=' operator doesn't +-- cause problems in concurrent sessions +-- BEGIN; + +-- CREATE FUNCTION vol(text) returns text as +-- 'begin return $1; end' language plpgsql volatile; + +SELECT CASE + (CASE vol('bar') + WHEN udf('foo') THEN 'it was foo!' + WHEN udf(vol(null)) THEN 'null input' + WHEN 'bar' THEN 'it was bar!' END + ) + WHEN udf('it was foo!') THEN 'foo recognized' + WHEN 'it was bar!' THEN udf('bar recognized') + ELSE 'unrecognized' END AS col; + +-- We don't support the features below: +-- 1. CREATE DOMAIN ... +-- 2. CREATE OPERATOR ... +-- 3. CREATE TYPE ... + +-- In this case, we can't inline the SQL function without confusing things. 
+-- CREATE DOMAIN foodomain AS text; + +-- CREATE FUNCTION volfoo(text) returns foodomain as +-- 'begin return $1::foodomain; end' language plpgsql volatile; + +-- CREATE FUNCTION inline_eq(foodomain, foodomain) returns boolean as +-- 'SELECT CASE $2::text WHEN $1::text THEN true ELSE false END' language sql; + +-- CREATE OPERATOR = (procedure = inline_eq, +-- leftarg = foodomain, rightarg = foodomain); + +-- SELECT CASE volfoo('bar') WHEN 'foo'::foodomain THEN 'is foo' ELSE 'is not foo' END; + +-- ROLLBACK; + +-- Test multiple evaluation of a CASE arg that is a read/write object (#14472) +-- Wrap this in a single transaction so the transient '=' operator doesn't +-- cause problems in concurrent sessions +-- BEGIN; + +-- CREATE DOMAIN arrdomain AS int[]; + +-- CREATE FUNCTION make_ad(int,int) returns arrdomain as +-- 'declare x arrdomain; +-- begin +-- x := array[$1,$2]; +-- return x; +-- end' language plpgsql volatile; + +-- CREATE FUNCTION ad_eq(arrdomain, arrdomain) returns boolean as +-- 'begin return array_eq($1, $2); end' language plpgsql; + +-- CREATE OPERATOR = (procedure = ad_eq, +-- leftarg = arrdomain, rightarg = arrdomain); + +-- SELECT CASE make_ad(1,2) +-- WHEN array[2,4]::arrdomain THEN 'wrong' +-- WHEN array[2,5]::arrdomain THEN 'still wrong' +-- WHEN array[1,2]::arrdomain THEN 'right' +-- END; + +-- ROLLBACK; + +-- Test interaction of CASE with ArrayCoerceExpr (bug #15471) +-- BEGIN; + +-- CREATE TYPE casetestenum AS ENUM ('e', 'f', 'g'); + +-- SELECT +-- CASE 'foo'::text +-- WHEN 'foo' THEN ARRAY['a', 'b', 'c', 'd'] || enum_range(NULL::casetestenum)::text[] +-- ELSE ARRAY['x', 'y'] +-- END; + +-- ROLLBACK; + +-- +-- Clean up +-- + +DROP TABLE CASE_TBL; +DROP TABLE CASE2_TBL; +set spark.sql.crossJoin.enabled=false; diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out index fa078d16d6d36..dbd775e5ebba9 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out @@ -396,7 +396,7 @@ SELECT CASE WHEN 'it was bar!' THEN 'bar recognized' ELSE 'unrecognized' END -- !query 33 schema -struct +struct -- !query 33 output bar recognized diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out new file mode 100644 index 0000000000000..55bef64338f4e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out @@ -0,0 +1,425 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 37 + + +-- !query 0 +set spark.sql.crossJoin.enabled=true +-- !query 0 schema +struct +-- !query 0 output +spark.sql.crossJoin.enabled true + + +-- !query 1 +CREATE TABLE CASE_TBL ( + i integer, + f double +) USING parquet +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +CREATE TABLE CASE2_TBL ( + i integer, + j integer +) USING parquet +-- !query 2 schema +struct<> +-- !query 2 output + + + +-- !query 3 +INSERT INTO CASE_TBL VALUES (1, 10.1) +-- !query 3 schema +struct<> +-- !query 3 output + + + +-- !query 4 +INSERT INTO CASE_TBL VALUES (2, 20.2) +-- !query 4 schema +struct<> +-- !query 4 output + + + +-- !query 5 +INSERT INTO CASE_TBL VALUES (3, -30.3) +-- !query 5 schema +struct<> +-- !query 5 output + + + +-- !query 6 +INSERT INTO CASE_TBL VALUES (4, NULL) +-- !query 6 schema +struct<> +-- !query 6 output + + + +-- !query 7 +INSERT INTO CASE2_TBL VALUES (1, -1) +-- !query 7 schema +struct<> +-- !query 7 output + + + +-- !query 8 +INSERT INTO CASE2_TBL VALUES (2, -2) +-- !query 8 schema +struct<> +-- !query 8 output + + + +-- !query 9 +INSERT INTO CASE2_TBL VALUES (3, -3) +-- !query 9 schema +struct<> +-- !query 9 output + + + +-- !query 10 +INSERT INTO CASE2_TBL VALUES (2, -4) +-- !query 10 schema +struct<> +-- !query 10 output + + + +-- !query 11 
+INSERT INTO CASE2_TBL VALUES (1, NULL) +-- !query 11 schema +struct<> +-- !query 11 output + + + +-- !query 12 +INSERT INTO CASE2_TBL VALUES (NULL, -6) +-- !query 12 schema +struct<> +-- !query 12 output + + + +-- !query 13 +SELECT '3' AS `One`, + CASE + WHEN CAST(udf(1 < 2) AS boolean) THEN 3 + END AS `Simple WHEN` +-- !query 13 schema +struct +-- !query 13 output +3 3 + + +-- !query 14 +SELECT '' AS `One`, + CASE + WHEN 1 > 2 THEN udf(3) + END AS `Simple default` +-- !query 14 schema +struct +-- !query 14 output + NULL + + +-- !query 15 +SELECT '3' AS `One`, + CASE + WHEN udf(1) < 2 THEN udf(3) + ELSE udf(4) + END AS `Simple ELSE` +-- !query 15 schema +struct +-- !query 15 output +3 3 + + +-- !query 16 +SELECT udf('4') AS `One`, + CASE + WHEN 1 > 2 THEN 3 + ELSE 4 + END AS `ELSE default` +-- !query 16 schema +struct +-- !query 16 output +4 4 + + +-- !query 17 +SELECT udf('6') AS `One`, + CASE + WHEN CAST(udf(1 > 2) AS boolean) THEN 3 + WHEN udf(4) < 5 THEN 6 + ELSE 7 + END AS `Two WHEN with default` +-- !query 17 schema +struct +-- !query 17 output +6 6 + + +-- !query 18 +SELECT '7' AS `None`, + CASE WHEN rand() < udf(0) THEN 1 + END AS `NULL on no matches` +-- !query 18 schema +struct +-- !query 18 output +7 NULL + + +-- !query 19 +SELECT CASE WHEN CAST(udf(1=0) AS boolean) THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END +-- !query 19 schema +struct +-- !query 19 output +1.0 + + +-- !query 20 +SELECT CASE 1 WHEN 0 THEN 1/udf(0) WHEN 1 THEN 1 ELSE 2/0 END +-- !query 20 schema +struct +-- !query 20 output +1.0 + + +-- !query 21 +SELECT CASE WHEN i > 100 THEN udf(1/0) ELSE udf(0) END FROM case_tbl +-- !query 21 schema +struct 100) THEN udf((cast(1 as double) / cast(0 as double))) ELSE udf(0) END:string> +-- !query 21 output +0 +0 +0 +0 + + +-- !query 22 +SELECT CASE 'a' WHEN 'a' THEN udf(1) ELSE udf(2) END +-- !query 22 schema +struct +-- !query 22 output +1 + + +-- !query 23 +SELECT '' AS `Five`, + CASE + WHEN i >= 3 THEN i + END AS `>= 3 or Null` + FROM CASE_TBL +-- 
!query 23 schema +struct= 3 or Null:int> +-- !query 23 output +3 + 4 + NULL + NULL + + +-- !query 24 +SELECT '' AS `Five`, + CASE WHEN i >= 3 THEN (i + i) + ELSE i + END AS `Simplest Math` + FROM CASE_TBL +-- !query 24 schema +struct +-- !query 24 output +1 + 2 + 6 + 8 + + +-- !query 25 +SELECT '' AS `Five`, i AS `Value`, + CASE WHEN (i < 0) THEN 'small' + WHEN (i = 0) THEN 'zero' + WHEN (i = 1) THEN 'one' + WHEN (i = 2) THEN 'two' + ELSE 'big' + END AS `Category` + FROM CASE_TBL +-- !query 25 schema +struct +-- !query 25 output +1 one + 2 two + 3 big + 4 big + + +-- !query 26 +SELECT '' AS `Five`, + CASE WHEN ((i < 0) or (i < 0)) THEN 'small' + WHEN ((i = 0) or (i = 0)) THEN 'zero' + WHEN ((i = 1) or (i = 1)) THEN 'one' + WHEN ((i = 2) or (i = 2)) THEN 'two' + ELSE 'big' + END AS `Category` + FROM CASE_TBL +-- !query 26 schema +struct +-- !query 26 output +big + big + one + two + + +-- !query 27 +SELECT * FROM CASE_TBL WHERE udf(COALESCE(f,i)) = 4 +-- !query 27 schema +struct +-- !query 27 output +4 NULL + + +-- !query 28 +SELECT * FROM CASE_TBL WHERE udf(NULLIF(f,i)) = 2 +-- !query 28 schema +struct +-- !query 28 output + + + +-- !query 29 +SELECT udf(COALESCE(a.f, b.i, b.j)) + FROM CASE_TBL a, CASE2_TBL b +-- !query 29 schema +struct +-- !query 29 output +-30.3 +-30.3 +-30.3 +-30.3 +-30.3 +-30.3 +-6.0 +1.0 +1.0 +10.1 +10.1 +10.1 +10.1 +10.1 +10.1 +2.0 +2.0 +20.2 +20.2 +20.2 +20.2 +20.2 +20.2 +3.0 + + +-- !query 30 +SELECT * + FROM CASE_TBL a, CASE2_TBL b + WHERE udf(COALESCE(a.f, b.i, b.j)) = 2 +-- !query 30 schema +struct +-- !query 30 output +4 NULL 2 -2 +4 NULL 2 -4 + + +-- !query 31 +SELECT udf('') AS Five, NULLIF(a.i,b.i) AS `NULLIF(a.i,b.i)`, + NULLIF(b.i, 4) AS `NULLIF(b.i,4)` + FROM CASE_TBL a, CASE2_TBL b +-- !query 31 schema +struct +-- !query 31 output +1 2 + 1 2 + 1 3 + 1 NULL + 2 1 + 2 1 + 2 3 + 2 NULL + 3 1 + 3 1 + 3 2 + 3 2 + 3 NULL + 4 1 + 4 1 + 4 2 + 4 2 + 4 3 + 4 NULL + NULL 1 + NULL 1 + NULL 2 + NULL 2 + NULL 3 + + +-- !query 32 +SELECT '' AS 
`Two`, * + FROM CASE_TBL a, CASE2_TBL b + WHERE CAST(udf(COALESCE(f,b.i) = 2) AS boolean) +-- !query 32 schema +struct +-- !query 32 output +4 NULL 2 -2 + 4 NULL 2 -4 + + +-- !query 33 +SELECT CASE + (CASE vol('bar') + WHEN udf('foo') THEN 'it was foo!' + WHEN udf(vol(null)) THEN 'null input' + WHEN 'bar' THEN 'it was bar!' END + ) + WHEN udf('it was foo!') THEN 'foo recognized' + WHEN 'it was bar!' THEN udf('bar recognized') + ELSE 'unrecognized' END AS col +-- !query 33 schema +struct +-- !query 33 output +bar recognized + + +-- !query 34 +DROP TABLE CASE_TBL +-- !query 34 schema +struct<> +-- !query 34 output + + + +-- !query 35 +DROP TABLE CASE2_TBL +-- !query 35 schema +struct<> +-- !query 35 output + + + +-- !query 36 +set spark.sql.crossJoin.enabled=false +-- !query 36 schema +struct +-- !query 36 output +spark.sql.crossJoin.enabled false diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala index 76ec85120e86f..7caf6241bb984 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala @@ -251,9 +251,10 @@ object IntegratedUDFTestUtils extends SQLHelper { */ case class TestScalaUDF(name: String) extends TestUDF { private[IntegratedUDFTestUtils] lazy val udf = SparkUserDefinedFunction( - (input: Any) => input.toString, + (input: Any) => String.valueOf(input), StringType, - inputSchemas = Seq.fill(1)(None)) + inputSchemas = Seq.fill(1)(None), + name = Some(name)) def apply(exprs: Column*): Column = udf(exprs: _*) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 1c8cf6403c6c2..c8a187b57a610 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -255,12 +255,15 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { val localSparkSession = spark.newSession() loadTestData(localSparkSession) testCase match { - case udfTestCase: UDFTestCase => registerTestUDF(udfTestCase.udf, localSparkSession) + case udfTestCase: UDFTestCase => + // vol used by udf-case.sql. + localSparkSession.udf.register("vol", (s: String) => s) + registerTestUDF(udfTestCase.udf, localSparkSession) case _: PgSQLTestCase => // booleq/boolne used by boolean.sql localSparkSession.udf.register("booleq", (b1: Boolean, b2: Boolean) => b1 == b2) localSparkSession.udf.register("boolne", (b1: Boolean, b2: Boolean) => b1 != b2) - // vol used by boolean.sql + // vol used by boolean.sql and case.sql. localSparkSession.udf.register("vol", (s: String) => s) case _ => // Don't add UDFs in Regular tests. } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index f2a71bd628bdd..f155b5dc80cf1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -316,9 +316,9 @@ class UDFSuite extends QueryTest with SharedSQLContext { val udf2Name = "myUdf2" val udf1 = spark.udf.register(udf1Name, (n: Int) => n + 1) val udf2 = spark.udf.register(udf2Name, (n: Int) => n * 1) - assert(explainStr(sql("SELECT myUdf1(myUdf2(1))")).contains(s"UDF:$udf1Name(UDF:$udf2Name(1))")) + assert(explainStr(sql("SELECT myUdf1(myUdf2(1))")).contains(s"$udf1Name($udf2Name(1))")) assert(explainStr(spark.range(1).select(udf1(udf2(functions.lit(1))))) - .contains(s"UDF:$udf1Name(UDF:$udf2Name(1))")) + .contains(s"$udf1Name($udf2Name(1))")) } test("SPARK-23666 Do not display exprId in argument names") { @@ -329,7 +329,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { Console.withOut(outputStream) { spark.sql("SELECT f(a._1) FROM 
x").show } - assert(outputStream.toString.contains("UDF:f(a._1 AS `_1`)")) + assert(outputStream.toString.contains("f(a._1 AS `_1`)")) } }