[SPARK-13249][SQL] Add Filter checking nullability of keys for inner join #11235

Closed · wants to merge 15 commits
Changes from 13 commits
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.analysis.{CleanupAliases, EliminateSubquery
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
-import org.apache.spark.sql.catalyst.planning.{ExtractFiltersAndInnerJoins, Unions}
+import org.apache.spark.sql.catalyst.planning.{ExtractEquiJoinKeys, ExtractFiltersAndInnerJoins, Unions}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
@@ -57,6 +57,8 @@ abstract class Optimizer extends RuleExecutor[LogicalPlan] {
ReplaceDistinctWithAggregate) ::
Batch("Aggregate", FixedPoint(100),
RemoveLiteralFromGroupExpressions) ::
Batch("Join", FixedPoint(100),
Contributor: We may have more InnerJoin from OuterJoinElimination; should we move this rule after that?

Contributor: If we can make this rule idempotent, we don't need to put it in a separate group.

AddFilterOfNullForInnerJoin) ::
Contributor: We should make this rule idempotent instead of hacking it to run only once. You are losing the benefits of emergent optimizations with this implementation.

I would directly construct the filter in the left/right child, but only when it's not already present in the constraints of the child. This is the whole reason we added the ability to reason about which constraints are already present on a subtree.
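A minimal sketch of the idempotency check the reviewer is describing, using a toy expression ADT rather than Catalyst's real `ExpressionSet`/`QueryPlan.constraints` API (all names here are illustrative, not Spark's):

```scala
// Toy model, not Spark code: only emit an IsNotNull predicate for a key when
// the child's known constraints do not already guarantee it. Running the rule
// a second time then adds nothing, i.e. the rule is idempotent.
object IdempotentNullFilterSketch {
  case class Attr(name: String)
  sealed trait Expr
  case class IsNotNull(a: Attr) extends Expr

  def missingNullFilters(keys: Seq[Attr], childConstraints: Set[Expr]): Seq[Expr] =
    keys.distinct.collect {
      case k if !childConstraints.contains(IsNotNull(k)) => IsNotNull(k)
    }
}
```

With this shape the rule could live in an ordinary fixed-point batch: once the filters exist, the child's constraints include the `IsNotNull` predicates and the rule no longer fires.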

Batch("Operator Optimizations", FixedPoint(100),
// Operator push down
SetOperationPushDown,
@@ -143,6 +145,56 @@ object EliminateSerialization extends Rule[LogicalPlan] {
}
}

/**
 * Adds a Filter to the left and right children of an inner Join to filter out rows
 * with null join keys. The join then no longer needs to check key nullability, and
 * filtering out null keys early also reduces the amount of data fed into the Join.
 */
object AddFilterOfNullForInnerJoin extends Rule[LogicalPlan] with PredicateHelper {
Contributor: We could also do this for semi join and outer join; should we call it AddFilterOfNullForEquiJoin?

Contributor: Not commenting on the name at all (I haven't looked at the rest of the PR), but AddFilterOfNullForEquiJoin can be shortened slightly to AddNullFilterForEquiJoin.

Member (Author): Renamed. Will do the semi join and outer join parts in a separate PR once this one gets merged.

def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case ExtractEquiJoinKeys(Inner, leftKeys, rightKeys, condition, left, right) =>
val leftConditions = leftKeys.distinct.map { l =>
Contributor: We should only add a predicate if the key is nullable and there is no IsNotNull constraint on the key.
IsNotNull(l)
}.reduceLeft(And)

val rightConditions = rightKeys.distinct.map { r =>
IsNotNull(r)
}.reduceLeft(And)
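As an aside, the `map`-then-`reduceLeft(And)` shape used for these conditions can be pictured with a toy expression tree (illustrative types, not Catalyst's):

```scala
// Toy expression tree, not Spark code: build IsNotNull(k1) AND IsNotNull(k2) AND ...
// from the distinct join keys, the same left-leaning conjunction shape as above.
object ConjunctionSketch {
  sealed trait Expr
  case class Attr(name: String) extends Expr
  case class IsNotNull(child: Expr) extends Expr
  case class And(left: Expr, right: Expr) extends Expr

  def notNullCondition(keys: Seq[Attr]): Expr =
    keys.distinct.map(k => IsNotNull(k): Expr).reduceLeft(And(_, _))
}
```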

val keysConditions = ExpressionSet(leftKeys.zip(rightKeys).map { lr =>
EqualTo(lr._2, lr._1)
}).reduceLeft(And)

val finalConditions = if (condition.isDefined) {
Contributor: It's better to use the original condition.
And(keysConditions, condition.get)
} else {
keysConditions
}

val leftHasAllNotNullPredicate = left.constraints.nonEmpty &&
left.constraints.filter(_.isInstanceOf[IsNotNull])
.forall(expr => leftConditions.references.intersect(expr.references).nonEmpty)

val rightHasAllNotNullPredicate = right.constraints.nonEmpty &&
right.constraints.filter(_.isInstanceOf[IsNotNull])
.forall(expr => rightConditions.references.intersect(expr.references).nonEmpty)
Contributor: This seems complicated and definitely needs more comments. Why are we not just adding IsNotNull to the left/right when it doesn't already exist in the conditions?

Member (Author): Because we want to filter out nulls before the join operator runs. If we simply add IsNotNull predicates to the join condition, the null rows are still read while performing the join.


val newLeft = if (leftHasAllNotNullPredicate) {
left
} else {
Filter(leftConditions, left)
}

val newRight = if (rightHasAllNotNullPredicate) {
right
} else {
Filter(rightConditions, right)
}

Join(newLeft, newRight, Inner, Some(finalConditions))
Contributor: If no filter is added, we should return the original Join.

}
}
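To see why the rewrite is safe, note that an inner equi-join can never match a null key (SQL equality involving NULL is never true), so pre-filtering null keys preserves the result while shrinking the join's input. A standalone sketch of that invariant over toy collections (not Spark code):

```scala
// Toy model, not Spark code: an inner equi-join with SQL-like null semantics,
// where a null key never matches anything (not even another null).
object NullFilterEquivalenceSketch {
  def innerJoin[A, B](left: Seq[(String, A)], right: Seq[(String, B)]): Seq[(String, A, B)] =
    for {
      (lk, a) <- left
      (rk, b) <- right
      if lk != null && rk != null && lk == rk
    } yield (lk, a, b)

  // Pre-filtering null keys on both sides yields the same join result,
  // but each side feeds fewer rows into the join itself.
  def prefiltered[A, B](left: Seq[(String, A)], right: Seq[(String, B)]): Seq[(String, A, B)] =
    innerJoin(left.filter(_._1 != null), right.filter(_._1 != null))
}
```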

/**
* Pushes down [[LocalLimit]] beneath UNION ALL and beneath the streamed inputs of outer joins.
*/
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.IntegerType

class AddFilterOfNullForInnerJoinSuite extends PlanTest {

object Optimize extends RuleExecutor[LogicalPlan] {
val batches =
Batch("Subqueries", Once,
EliminateSubqueryAliases) ::
Batch("Add Filter", FixedPoint(100),
AddFilterOfNullForInnerJoin) :: Nil
}

val testRelation = LocalRelation('a.int, 'b.int, 'c.int)

val testRelation1 = LocalRelation('d.int)

test("add filters to left and right of inner join") {
val x = testRelation.subquery('x)
val y = testRelation1.subquery('y)

val originalQuery = {
x.join(y, condition = Some("d".attr === "b".attr && "d".attr === "c".attr && "a".attr > 10))
}

val optimized = Optimize.execute(originalQuery.analyze)
val left = testRelation.where(IsNotNull('b) && IsNotNull('c))
val right = testRelation1.where(IsNotNull('d))
val correctAnswer =
left.join(right,
condition = Some(("d".attr === "b".attr && "d".attr === "c".attr) && "a".attr > 10)).analyze

comparePlans(optimized, correctAnswer)
}

test("don't need to add filters to left and right of inner join if constraints are ready") {
// left and right of join node already have not null constraints
val x = testRelation.subquery('x).where(IsNotNull('b) && IsNotNull('c)).select('a, 'b, 'c)
val y = testRelation1.subquery('y).where(IsNotNull('d)).select('d)

val originalQuery = {
x.join(y, condition = Some("d".attr === "b".attr && "d".attr === "c".attr && "a".attr > 10))
}

val optimized = Optimize.execute(originalQuery.analyze)
assert(optimized.collect {
case j @ Join(left, right, _, _)
if !left.isInstanceOf[Filter] && !right.isInstanceOf[Filter] => 1
}.nonEmpty)
}
}