[SPARK-13657] [SQL] Support parsing very long AND/OR expressions

## What changes were proposed in this pull request? In order to avoid StackOverflow when parse a expression with hundreds of ORs, we should use loop instead of recursive functions to flatten the tree as list. This PR also build a balanced tree to reduce the depth of generated And/Or expression, to avoid StackOverflow in analyzer/optimizer. ## How was this patch tested? Add new unit tests. Manually tested with TPCDS Q3 with hundreds predicates in it [1]. These predicates help to reduce the number of partitions, then the query time went from 60 seconds to 8 seconds. [1] https://github.com/cloudera/impala-tpcds-kit/blob/master/queries/q3.sql Author: Davies Liu <davies@databricks.com> Closes #11501 from davies/long_or.
apache · Mar 8, 2016 · 78d3b60 · 78d3b60
1 parent 54040f8
commit 78d3b60
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 2 deletions.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala
@@ -19,6 +19,9 @@ package org.apache.spark.sql.catalyst.parser
 
 import java.sql.Date
 
+import scala.collection.mutable.ArrayBuffer
+import scala.util.matching.Regex
+
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis._
@@ -523,6 +526,39 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
       noParseRule("Select", node)
   }
 
+  /**
+   * Flattens the left deep tree with the specified pattern into a list.
+   */
+  private def flattenLeftDeepTree(node: ASTNode, pattern: Regex): Seq[ASTNode] = {
+    val collected = ArrayBuffer[ASTNode]()
+    var rest = node
+    while (rest match {
+      case Token(pattern(), l :: r :: Nil) =>
+        collected += r
+        rest = l
+        true
+      case _ => false
+    }) {
+      // do nothing
+    }
+    collected += rest
+    // keep them in the same order as in SQL
+    collected.reverse
+  }
+
+  /**
+   * Creates a balanced tree that has similar number of nodes on left and right.
+   *
+   * This help to reduce the depth of the tree to prevent StackOverflow in analyzer/optimizer.
+   */
+  private def balancedTree(
+      expr: Seq[Expression],
+      f: (Expression, Expression) => Expression): Expression = expr.length match {
+    case 1 => expr.head
+    case 2 => f(expr.head, expr(1))
+    case l => f(balancedTree(expr.slice(0, l / 2), f), balancedTree(expr.slice(l / 2, l), f))
+  }
+
   protected def nodeToExpr(node: ASTNode): Expression = node match {
     /* Attribute References */
     case Token("TOK_TABLE_OR_COL", Token(name, Nil) :: Nil) =>
@@ -635,8 +671,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
       }
 
     /* Boolean Logic */
-    case Token(AND(), left :: right:: Nil) => And(nodeToExpr(left), nodeToExpr(right))
-    case Token(OR(), left :: right:: Nil) => Or(nodeToExpr(left), nodeToExpr(right))
+    case Token(AND(), left :: right:: Nil) =>
+      balancedTree(flattenLeftDeepTree(node, AND).map(nodeToExpr), And)
+    case Token(OR(), left :: right:: Nil) =>
+      balancedTree(flattenLeftDeepTree(node, OR).map(nodeToExpr), Or)
     case Token(NOT(), child :: Nil) => Not(nodeToExpr(child))
     case Token("!", child :: Nil) => Not(nodeToExpr(child))
 

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/CatalystQlSuite.scala
@@ -203,6 +203,17 @@ class CatalystQlSuite extends PlanTest {
       "from windowData")
   }
 
+  test("very long AND/OR expression") {
+    val equals = (1 to 1000).map(x => s"$x == $x")
+    val expr = parser.parseExpression(equals.mkString(" AND "))
+    assert(expr.isInstanceOf[And])
+    assert(expr.collect( { case EqualTo(_, _) => true } ).size == 1000)
+
+    val expr2 = parser.parseExpression(equals.mkString(" OR "))
+    assert(expr2.isInstanceOf[Or])
+    assert(expr2.collect( { case EqualTo(_, _) => true } ).size == 1000)
+  }
+
   test("subquery") {
     parser.parsePlan("select (select max(b) from s) ss from t")
     parser.parsePlan("select * from t where a = (select b from s)")