Skip to content

Commit

Permalink
[SPARK-13657] [SQL] Support parsing very long AND/OR expressions
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

In order to avoid StackOverflow when parse a expression with hundreds of ORs, we should use loop instead of recursive functions to flatten the tree as list. This PR also build a balanced tree to reduce the depth of generated And/Or expression, to avoid StackOverflow in analyzer/optimizer.

## How was this patch tested?

Add new unit tests. Manually tested with TPCDS Q3 with hundreds predicates in it [1]. These predicates help to reduce the number of partitions, then the query time went from 60 seconds to 8 seconds.

[1] https://github.com/cloudera/impala-tpcds-kit/blob/master/queries/q3.sql

Author: Davies Liu <davies@databricks.com>

Closes #11501 from davies/long_or.
  • Loading branch information
Davies Liu authored and davies committed Mar 8, 2016
1 parent 54040f8 commit 78d3b60
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ package org.apache.spark.sql.catalyst.parser

import java.sql.Date

import scala.collection.mutable.ArrayBuffer
import scala.util.matching.Regex

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis._
Expand Down Expand Up @@ -523,6 +526,39 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
noParseRule("Select", node)
}

/**
* Flattens the left deep tree with the specified pattern into a list.
*/
private def flattenLeftDeepTree(node: ASTNode, pattern: Regex): Seq[ASTNode] = {
val collected = ArrayBuffer[ASTNode]()
var rest = node
while (rest match {
case Token(pattern(), l :: r :: Nil) =>
collected += r
rest = l
true
case _ => false
}) {
// do nothing
}
collected += rest
// keep them in the same order as in SQL
collected.reverse
}

/**
* Creates a balanced tree that has similar number of nodes on left and right.
*
* This help to reduce the depth of the tree to prevent StackOverflow in analyzer/optimizer.
*/
private def balancedTree(
expr: Seq[Expression],
f: (Expression, Expression) => Expression): Expression = expr.length match {
case 1 => expr.head
case 2 => f(expr.head, expr(1))
case l => f(balancedTree(expr.slice(0, l / 2), f), balancedTree(expr.slice(l / 2, l), f))
}

protected def nodeToExpr(node: ASTNode): Expression = node match {
/* Attribute References */
case Token("TOK_TABLE_OR_COL", Token(name, Nil) :: Nil) =>
Expand Down Expand Up @@ -635,8 +671,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
}

/* Boolean Logic */
case Token(AND(), left :: right:: Nil) => And(nodeToExpr(left), nodeToExpr(right))
case Token(OR(), left :: right:: Nil) => Or(nodeToExpr(left), nodeToExpr(right))
case Token(AND(), left :: right:: Nil) =>
balancedTree(flattenLeftDeepTree(node, AND).map(nodeToExpr), And)
case Token(OR(), left :: right:: Nil) =>
balancedTree(flattenLeftDeepTree(node, OR).map(nodeToExpr), Or)
case Token(NOT(), child :: Nil) => Not(nodeToExpr(child))
case Token("!", child :: Nil) => Not(nodeToExpr(child))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,17 @@ class CatalystQlSuite extends PlanTest {
"from windowData")
}

test("very long AND/OR expression") {
val equals = (1 to 1000).map(x => s"$x == $x")
val expr = parser.parseExpression(equals.mkString(" AND "))
assert(expr.isInstanceOf[And])
assert(expr.collect( { case EqualTo(_, _) => true } ).size == 1000)

val expr2 = parser.parseExpression(equals.mkString(" OR "))
assert(expr2.isInstanceOf[Or])
assert(expr2.collect( { case EqualTo(_, _) => true } ).size == 1000)
}

test("subquery") {
parser.parsePlan("select (select max(b) from s) ss from t")
parser.parsePlan("select * from t where a = (select b from s)")
Expand Down

0 comments on commit 78d3b60

Please sign in to comment.