-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-12656] [SQL] Implement Intersect with Left-semi Join #10630
Changes from 43 commits
01e4cdf
6835704
9180687
b38a21e
d2b84af
fda8025
ac0dccd
6e0018b
0546772
b37a64f
c2a872c
ab6dbd7
4276356
0bd1771
7bd102b
bfa99c5
cd23b03
100174a
9aad1cf
6742984
e4c34f0
2dab708
9864b3f
24cea7d
27192be
a932cdb
04a26bd
0458770
6a52e2b
f820c61
4372170
1debdfa
763706d
4de6ec1
9422a4f
52bdf48
1e95df3
d59b37b
6a7979d
fd87585
e566d79
3be78c4
e51de8f
b600089
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -214,12 +214,22 @@ trait CheckAnalysis { | |
s"""Only a single table generating function is allowed in a SELECT clause, found: | ||
| ${exprs.map(_.prettyString).mkString(",")}""".stripMargin) | ||
|
||
// Special handling for cases when self-join introduces duplicate expression ids. | ||
case j @ Join(left, right, _, _) if left.outputSet.intersect(right.outputSet).nonEmpty => | ||
val conflictingAttributes = left.outputSet.intersect(right.outputSet) | ||
case j: Join if !j.duplicateResolved => | ||
val conflictingAttributes = j.left.outputSet.intersect(j.right.outputSet) | ||
failAnalysis( | ||
s""" | ||
|Failure when resolving conflicting references in Join: | ||
|Failure when resolving conflicting references | ||
|in operator ${operator.simpleString}: | ||
|$plan | ||
|Conflicting attributes: ${conflictingAttributes.mkString(",")} | ||
|""".stripMargin) | ||
|
||
case i: Intersect if !i.duplicateResolved => | ||
val conflictingAttributes = i.left.outputSet.intersect(i.right.outputSet) | ||
failAnalysis( | ||
s""" | ||
|Failure when resolving conflicting references | ||
|in operator ${operator.simpleString}: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here, we could say |
||
|$plan | ||
|Conflicting attributes: ${conflictingAttributes.mkString(",")} | ||
|""".stripMargin) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,8 +52,10 @@ abstract class Optimizer extends RuleExecutor[LogicalPlan] { | |
// since the other rules might make two separate Unions operators adjacent. | ||
Batch("Union", Once, | ||
CombineUnions) :: | ||
Batch("Replace Operators", FixedPoint(100), | ||
ReplaceIntersectWithSemiJoin, | ||
ReplaceDistinctWithAggregate) :: | ||
Batch("Aggregate", FixedPoint(100), | ||
ReplaceDistinctWithAggregate, | ||
RemoveLiteralFromGroupExpressions) :: | ||
Batch("Operator Optimizations", FixedPoint(100), | ||
// Operator push down | ||
|
@@ -124,18 +126,13 @@ object EliminateSerialization extends Rule[LogicalPlan] { | |
} | ||
|
||
/** | ||
* Pushes certain operations to both sides of a Union, Intersect or Except operator. | ||
* Pushes certain operations to both sides of a Union or Except operator. | ||
* Operations that are safe to pushdown are listed as follows. | ||
* Union: | ||
* Right now, Union means UNION ALL, which does not de-duplicate rows. So, it is | ||
* safe to pushdown Filters and Projections through it. Once we add UNION DISTINCT, | ||
* we will not be able to pushdown Projections. | ||
* | ||
* Intersect: | ||
* It is not safe to pushdown Projections through it because we need to get the | ||
* intersect of rows by comparing the entire rows. It is fine to pushdown Filters | ||
* with deterministic condition. | ||
* | ||
* Except: | ||
* It is not safe to pushdown Projections through it because we need to get the | ||
* intersect of rows by comparing the entire rows. It is fine to pushdown Filters | ||
|
@@ -153,7 +150,7 @@ object SetOperationPushDown extends Rule[LogicalPlan] with PredicateHelper { | |
|
||
/** | ||
* Rewrites an expression so that it can be pushed to the right side of a | ||
* Union, Intersect or Except operator. This method relies on the fact that the output attributes | ||
* Union or Except operator. This method relies on the fact that the output attributes | ||
* of a union/intersect/except are always equal to the left child's output. | ||
*/ | ||
private def pushToRight[A <: Expression](e: A, rewrites: AttributeMap[Attribute]) = { | ||
|
@@ -210,17 +207,6 @@ object SetOperationPushDown extends Rule[LogicalPlan] with PredicateHelper { | |
} | ||
Filter(nondeterministic, Union(newFirstChild +: newOtherChildren)) | ||
|
||
// Push down filter through INTERSECT | ||
case Filter(condition, Intersect(left, right)) => | ||
val (deterministic, nondeterministic) = partitionByDeterministic(condition) | ||
val rewrites = buildRewrites(left, right) | ||
Filter(nondeterministic, | ||
Intersect( | ||
Filter(deterministic, left), | ||
Filter(pushToRight(deterministic, rewrites), right) | ||
) | ||
) | ||
|
||
// Push down filter through EXCEPT | ||
case Filter(condition, Except(left, right)) => | ||
val (deterministic, nondeterministic) = partitionByDeterministic(condition) | ||
|
@@ -1054,6 +1040,27 @@ object ReplaceDistinctWithAggregate extends Rule[LogicalPlan] { | |
} | ||
} | ||
|
||
/** | ||
* Replaces logical [[Intersect]] operator with a left-semi [[Join]] operator. | ||
* {{{ | ||
* SELECT a1, a2 FROM Tab1 INTERSECT SELECT b1, b2 FROM Tab2 | ||
* ==> SELECT DISTINCT a1, a2 FROM Tab1 LEFT SEMI JOIN Tab2 ON a1<=>b1 AND a2<=>b2 | ||
* }}} | ||
* | ||
* Note: | ||
* 1. This rule is only applicable to INTERSECT DISTINCT. Do not use it for INTERSECT ALL. | ||
* 2. This rule has to be done after de-duplicating the attributes; otherwise, the generated | ||
* join conditions will be incorrect. | ||
*/ | ||
object ReplaceIntersectWithSemiJoin extends Rule[LogicalPlan] { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to add a comment at here to mention that this rewrite is just for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, will do it. Actually, I will also implement |
||
def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use transformUp? cc @yhuai There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually nvm. |
||
case Intersect(left, right) => | ||
assert(left.output.size == right.output.size) | ||
val joinCond = left.output.zip(right.output).map { case (l, r) => EqualNullSafe(l, r) } | ||
Distinct(Join(left, right, LeftSemi, joinCond.reduceLeftOption(And))) | ||
} | ||
} | ||
|
||
/** | ||
* Removes literals from group expressions in [[Aggregate]], as they have no effect on the result | ||
* but only make the grouping key bigger. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -315,4 +315,7 @@ abstract class BinaryNode extends LogicalPlan { | |
def right: LogicalPlan | ||
|
||
override def children: Seq[LogicalPlan] = Seq(left, right) | ||
|
||
override lazy val resolved: Boolean = | ||
expressions.forall(_.resolved) && childrenResolved | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why override this? |
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -270,7 +270,8 @@ class AnalysisErrorSuite extends AnalysisTest { | |
val error = intercept[AnalysisException] { | ||
SimpleAnalyzer.checkAnalysis(join) | ||
} | ||
assert(error.message.contains("Failure when resolving conflicting references in Join")) | ||
assert(error.message.contains("Failure when resolving conflicting references\n" + | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can revert this after revert the error message change. |
||
"in operator 'Join")) | ||
assert(error.message.contains("Conflicting attributes")) | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
now we can keep this message as it only checks join :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can users observe this error, or can it be considered an internal error? BTW, we are about to convert it to an internal error in the PR: #41476