-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-35080][SQL] Only allow a subset of correlated equality predicates when a subquery is aggregated #32179
Changes from 5 commits
3e18ade
dec271d
b8bf47f
2ad6548
03f4487
652b854
84f91b2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -899,14 +899,72 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { | |
// +- SubqueryAlias t1, `t1` | ||
// +- Project [_1#73 AS c1#76, _2#74 AS c2#77] | ||
// +- LocalRelation [_1#73, _2#74] | ||
def failOnNonEqualCorrelatedPredicate(found: Boolean, p: LogicalPlan): Unit = { | ||
if (found) { | ||
// SPARK-35080: The same issue can happen to correlated equality predicates when | ||
// they do not guarantee one-to-one mapping between inner and outer attributes. | ||
// For example: | ||
// Table: | ||
// t1(a, b): [(0, 6), (1, 5), (2, 4)] | ||
// t2(c): [(6)] | ||
// | ||
// Query: | ||
// SELECT c, (SELECT COUNT(*) FROM t1 WHERE a + b = c) FROM t2 | ||
// | ||
// Original subquery plan: | ||
// Aggregate [count(1)] | ||
// +- Filter ((a + b) = outer(c)) | ||
// +- LocalRelation [a, b] | ||
// | ||
// Plan after pulling up correlated predicates: | ||
// Aggregate [a, b] [count(1), a, b] | ||
// +- LocalRelation [a, b] | ||
// | ||
// Plan after rewrite: | ||
// Project [c1, count(1)] | ||
// +- Join LeftOuter ((a + b) = c) | ||
// :- LocalRelation [c] | ||
// +- Aggregate [a, b] [count(1), a, b] | ||
// +- LocalRelation [a, b] | ||
// | ||
// The right hand side of the join transformed from the subquery will output | ||
// count(1) | a | b | ||
// 1 | 0 | 6 | ||
// 1 | 1 | 5 | ||
// 1 | 2 | 4 | ||
// and the plan after rewrite will give the original query incorrect results. | ||
def failOnUnsupportedCorrelatedPredicate(predicates: Seq[Expression], p: LogicalPlan): Unit = { | ||
if (predicates.nonEmpty) { | ||
// Report a non-supported case as an exception | ||
failAnalysis(s"Correlated column is not allowed in a non-equality predicate:\n$p") | ||
failAnalysis(s"Correlated column is not allowed in predicate " + | ||
s"${predicates.map(_.sql).mkString}:\n$p") | ||
} | ||
} | ||
|
||
var foundNonEqualCorrelatedPred: Boolean = false | ||
def containsAttribute(e: Expression): Boolean = { | ||
e.find(_.isInstanceOf[Attribute]).isDefined | ||
} | ||
|
||
// Given a correlated predicate, check if it is either a non-equality predicate or | ||
// equality predicate that does not guarantee one-on-one mapping between inner and | ||
// outer attributes. When the correlated predicate does not contain any attribute | ||
// (i.e. only has outer references), it is supported and should return false. E.G.: | ||
// (a = outer(c)) -> false | ||
// (outer(c) = outer(d)) -> false | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we have test case for it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes added a test case in SubquerySuite |
||
// (a > outer(c)) -> true | ||
// (a + b = outer(c)) -> true | ||
// The last one is true because there can be multiple combinations of (a, b) that | ||
// satisfy the equality condition. For example, if outer(c) = 0, then both (0, 0) | ||
// and (-1, 1) can make the predicate evaluate to true. | ||
def isUnsupportedPredicate(condition: Expression): Boolean = condition match { | ||
// Only allow equality condition with one side being an attribute and another | ||
// side being an expression without attributes from the inner query. Note | ||
// OuterReference is a leaf node and will not be found here. | ||
case Equality(_: Attribute, b) => containsAttribute(b) | ||
case Equality(a, _: Attribute) => containsAttribute(a) | ||
maropu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
case e @ Equality(_, _) => containsAttribute(e) | ||
case _ => true | ||
} | ||
|
||
val unsupportedPredicates = mutable.ArrayBuffer.empty[Expression] | ||
|
||
// Simplify the predicates before validating any unsupported correlation patterns in the plan. | ||
AnalysisHelper.allowInvokingTransformsInAnalyzer { BooleanSimplification(sub).foreachUp { | ||
|
@@ -949,22 +1007,17 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { | |
// The other operator is Join. Filter can be anywhere in a correlated subquery. | ||
case f: Filter => | ||
val (correlated, _) = splitConjunctivePredicates(f.condition).partition(containsOuter) | ||
|
||
// Find any non-equality correlated predicates | ||
foundNonEqualCorrelatedPred = foundNonEqualCorrelatedPred || correlated.exists { | ||
case _: EqualTo | _: EqualNullSafe => false | ||
case _ => true | ||
} | ||
unsupportedPredicates ++= correlated.filter(isUnsupportedPredicate) | ||
failOnInvalidOuterReference(f) | ||
|
||
// Aggregate cannot host any correlated expressions | ||
// It can be on a correlation path if the correlation contains | ||
// only equality correlated predicates. | ||
// only supported correlated equality predicates. | ||
// It cannot be on a correlation path if the correlation has | ||
// non-equality correlated predicates. | ||
case a: Aggregate => | ||
failOnInvalidOuterReference(a) | ||
failOnNonEqualCorrelatedPredicate(foundNonEqualCorrelatedPred, a) | ||
failOnUnsupportedCorrelatedPredicate(unsupportedPredicates, a) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to cause a compilation error with Scala 2.13. Could you re-check with Scala 2.13, @allisonwang-db ? Please try to add
|
||
|
||
// Join can host correlated expressions. | ||
case j @ Join(left, right, joinType, _, _) => | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: drop
s