[SPARK-24891][SQL] Fix HandleNullInputsForUDF rule
## What changes were proposed in this pull request?

The `HandleNullInputsForUDF` rule would add a new `If` node every time it was applied, so a plan analyzed once and the same plan analyzed twice (or more) ended up different. That raised issues such as plans failing to match in the cache manager. The solution is to mark the arguments as null-checked when the UDF is added under an `If` node, by wrapping them in a `KnownNotNull` node: since the UDF is clearly not called when any of those arguments is null, a later application of the rule can safely skip them.
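
To make the failure mode concrete, the sketch below models the problem and the fix with a toy expression ADT and a simplified bottom-up rewrite. All names here (`Expr`, `Udf`, `transformUp`, the two rules) are invented for illustration; this is not Spark's actual Catalyst code.

```scala
// Toy expression tree standing in for Catalyst expressions.
sealed trait Expr
case class Attr(name: String) extends Expr
case class IsNull(child: Expr) extends Expr
case class Or(left: Expr, right: Expr) extends Expr
case class If(cond: Expr, thenExpr: Expr, elseExpr: Expr) extends Expr
case class Udf(children: Seq[Expr]) extends Expr
case class KnownNotNull(child: Expr) extends Expr
case object NullLit extends Expr

object RuleIdempotencyDemo extends App {
  // Apply `rule` to every node bottom-up, like Catalyst's transformUp.
  def transformUp(e: Expr)(rule: Expr => Expr): Expr = {
    val rebuilt = e match {
      case IsNull(c)       => IsNull(transformUp(c)(rule))
      case Or(l, r)        => Or(transformUp(l)(rule), transformUp(r)(rule))
      case If(c, t, f)     => If(transformUp(c)(rule), transformUp(t)(rule), transformUp(f)(rule))
      case Udf(cs)         => Udf(cs.map(transformUp(_)(rule)))
      case KnownNotNull(c) => KnownNotNull(transformUp(c)(rule))
      case leaf            => leaf
    }
    rule(rebuilt)
  }

  // Old behavior: unconditionally wrap every UDF in a null check.
  val oldRule: Expr => Expr = {
    case u @ Udf(children) =>
      children.map(IsNull(_))
        .reduceLeftOption[Expr]((e1, e2) => Or(e1, e2))
        .map(If(_, NullLit, u))
        .getOrElse(u)
    case other => other
  }

  // Fixed behavior: skip inputs that are already marked, and mark the ones
  // we check, so a second application finds nothing left to do.
  val newRule: Expr => Expr = {
    case u @ Udf(children) =>
      val unchecked = children.filterNot(_.isInstanceOf[KnownNotNull])
      unchecked.map(IsNull(_))
        .reduceLeftOption[Expr]((e1, e2) => Or(e1, e2))
        .map(If(_, NullLit, Udf(children.map {
          case marked: KnownNotNull => marked
          case c                    => KnownNotNull(c)
        })))
        .getOrElse(u)
    case other => other
  }

  val plan = Udf(Seq(Attr("a"), Attr("b")))

  val oldOnce  = transformUp(plan)(oldRule)
  val oldTwice = transformUp(oldOnce)(oldRule)
  assert(oldOnce != oldTwice)  // the second pass wraps the UDF in another If

  val newOnce  = transformUp(plan)(newRule)
  val newTwice = transformUp(newOnce)(newRule)
  assert(newOnce == newTwice)  // the fixed rule is a no-op the second time
}
```

The real rule additionally consults the UDF's parameter types (only primitive parameters need the check), but the marking trick is the same: the wrapper records that a null check has already been added, which is what makes re-analysis a no-op.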

## How was this patch tested?

Add new tests under sql/UDFSuite and AnalysisSuite.

Author: maryannxue <maryannxue@apache.org>

Closes #21851 from maryannxue/spark-24891.
maryannxue authored and gatorsmile committed Jul 25, 2018
1 parent 15fff79 commit c26b092
Showing 4 changed files with 94 additions and 10 deletions.
@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.encoders.OuterScopes
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.SubExprUtils._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
-import org.apache.spark.sql.catalyst.expressions.objects.{LambdaVariable, MapObjects, NewInstance, UnresolvedMapObjects}
+import org.apache.spark.sql.catalyst.expressions.objects._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
@@ -2145,14 +2145,24 @@ class Analyzer(
           val parameterTypes = ScalaReflection.getParameterTypes(func)
           assert(parameterTypes.length == inputs.length)
 
+          // TODO: skip null handling for not-nullable primitive inputs after we can completely
+          // trust the `nullable` information.
+          // (cls, expr) => cls.isPrimitive && expr.nullable
+          val needsNullCheck = (cls: Class[_], expr: Expression) =>
+            cls.isPrimitive && !expr.isInstanceOf[KnownNotNull]
           val inputsNullCheck = parameterTypes.zip(inputs)
-            // TODO: skip null handling for not-nullable primitive inputs after we can completely
-            // trust the `nullable` information.
-            // .filter { case (cls, expr) => cls.isPrimitive && expr.nullable }
-            .filter { case (cls, _) => cls.isPrimitive }
+            .filter { case (cls, expr) => needsNullCheck(cls, expr) }
             .map { case (_, expr) => IsNull(expr) }
             .reduceLeftOption[Expression]((e1, e2) => Or(e1, e2))
-          inputsNullCheck.map(If(_, Literal.create(null, udf.dataType), udf)).getOrElse(udf)
+          // Once we add an `If` check above the udf, it is safe to mark those checked inputs
+          // as not nullable (i.e., wrap them with `KnownNotNull`), because the null-returning
+          // branch of `If` will be called if any of these checked inputs is null. Thus we can
+          // prevent this rule from being applied repeatedly.
+          val newInputs = parameterTypes.zip(inputs).map { case (cls, expr) =>
+            if (needsNullCheck(cls, expr)) KnownNotNull(expr) else expr }
+          inputsNullCheck
+            .map(If(_, Literal.create(null, udf.dataType), udf.copy(children = newInputs)))
+            .getOrElse(udf)
         }
       }
     }
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, FalseLiteral}
+import org.apache.spark.sql.types.DataType
+
+case class KnownNotNull(child: Expression) extends UnaryExpression {
+  override def nullable: Boolean = false
+  override def dataType: DataType = child.dataType
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    child.genCode(ctx).copy(isNull = FalseLiteral)
+  }
+
+  override def eval(input: InternalRow): Any = {
+    child.eval(input)
+  }
+}
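
In short, the new node deliberately does almost nothing: `eval` and codegen both forward to the child, and only the static `nullable` flag is overridden. A small sketch of that contract, assuming a build where these Catalyst classes are visible on the classpath:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{KnownNotNull, Literal}

object KnownNotNullContractDemo extends App {
  val wrapped = KnownNotNull(Literal(42))
  assert(!wrapped.nullable)                      // the only observable change: static nullability
  assert(wrapped.eval(InternalRow.empty) == 42)  // evaluation just forwards to the child
}
```
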
@@ -316,15 +316,16 @@ class AnalysisSuite extends AnalysisTest with Matchers {
 
     // only primitive parameter needs special null handling
     val udf2 = ScalaUDF((s: String, d: Double) => "x", StringType, string :: double :: Nil)
-    val expected2 = If(IsNull(double), nullResult, udf2)
+    val expected2 =
+      If(IsNull(double), nullResult, udf2.copy(children = string :: KnownNotNull(double) :: Nil))
     checkUDF(udf2, expected2)
 
     // special null handling should apply to all primitive parameters
     val udf3 = ScalaUDF((s: Short, d: Double) => "x", StringType, short :: double :: Nil)
     val expected3 = If(
       IsNull(short) || IsNull(double),
       nullResult,
-      udf3)
+      udf3.copy(children = KnownNotNull(short) :: KnownNotNull(double) :: Nil))
     checkUDF(udf3, expected3)
 
     // we can skip special null handling for primitive parameters that are not nullable
@@ -336,10 +337,19 @@
     val expected4 = If(
       IsNull(short),
       nullResult,
-      udf4)
+      udf4.copy(children = KnownNotNull(short) :: double.withNullability(false) :: Nil))
     // checkUDF(udf4, expected4)
   }
 
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule") {
+    val a = testRelation.output(0)
+    val func = (x: Int, y: Int) => x + y
+    val udf1 = ScalaUDF(func, IntegerType, a :: a :: Nil)
+    val udf2 = ScalaUDF(func, IntegerType, a :: udf1 :: Nil)
+    val plan = Project(Alias(udf2, "")() :: Nil, testRelation)
+    comparePlans(plan.analyze, plan.analyze.analyze)
+  }
+
   test("SPARK-11863 mixture of aliases and real columns in order by clause - tpcds 19,55,71") {
     val a = testRelation2.output(0)
     val c = testRelation2.output(2)
31 changes: 30 additions & 1 deletion sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql
 import org.apache.spark.sql.api.java._
 import org.apache.spark.sql.catalyst.plans.logical.Project
 import org.apache.spark.sql.execution.command.ExplainCommand
-import org.apache.spark.sql.functions.udf
+import org.apache.spark.sql.functions.{lit, udf}
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.test.SQLTestData._
 import org.apache.spark.sql.types.{DataTypes, DoubleType}
@@ -324,4 +324,33 @@
       assert(outputStream.toString.contains("UDF:f(a._1 AS `_1`)"))
     }
   }
+
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule") {
+    val udf1 = udf({(x: Int, y: Int) => x + y})
+    val df = spark.range(0, 3).toDF("a")
+      .withColumn("b", udf1($"a", udf1($"a", lit(10))))
+      .withColumn("c", udf1($"a", lit(null)))
+    val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed
+
+    comparePlans(df.logicalPlan, plan)
+    checkAnswer(
+      df,
+      Seq(
+        Row(0, 10, null),
+        Row(1, 12, null),
+        Row(2, 14, null)))
+  }
+
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule - with table") {
+    withTable("x") {
+      Seq((1, "2"), (2, "4")).toDF("a", "b").write.format("json").saveAsTable("x")
+      sql("insert into table x values(3, null)")
+      sql("insert into table x values(null, '4')")
+      spark.udf.register("f", (a: Int, b: String) => a + b)
+      val df = spark.sql("SELECT f(a, b) FROM x")
+      val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed
+      comparePlans(df.logicalPlan, plan)
+      checkAnswer(df, Seq(Row("12"), Row("24"), Row("3null"), Row(null)))
+    }
+  }
 }
