-
Notifications
You must be signed in to change notification settings - Fork 29.1k
[SPARK-22181][SQL]Adds ReplaceExceptWithFilter rule #19451
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1baecfd
ea7b150
0f9ea7c
5ca1e88
24b6b6c
5facb93
95a7d46
1690d7d
9319eaa
588e26d
f6c95d3
7030ab6
03040c1
e010651
ed54ffd
8bce960
40d8797
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.catalyst.optimizer | ||
|
|
||
| import scala.annotation.tailrec | ||
|
|
||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
|
|
||
|
|
||
| /** | ||
| * If one or both of the datasets in the logical [[Except]] operator are purely transformed using | ||
| * [[Filter]], this rule will replace logical [[Except]] operator with a [[Filter]] operator by | ||
| * flipping the filter condition of the right child. | ||
| * {{{ | ||
| * SELECT a1, a2 FROM Tab1 WHERE a2 = 12 EXCEPT SELECT a1, a2 FROM Tab1 WHERE a1 = 5 | ||
| * ==> SELECT DISTINCT a1, a2 FROM Tab1 WHERE a2 = 12 AND (a1 is null OR a1 <> 5) | ||
| * }}} | ||
| * | ||
| * Note: | ||
| * Before flipping the filter condition of the right node, we should: | ||
| * 1. Combine all it's [[Filter]]. | ||
| * 2. Apply InferFiltersFromConstraints rule (to take into account of NULL values in the condition). | ||
| */ | ||
| object ReplaceExceptWithFilter extends Rule[LogicalPlan] { | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = { | ||
| if (!plan.conf.replaceExceptWithFilter) { | ||
| return plan | ||
| } | ||
|
|
||
| plan.transform { | ||
| case Except(left, right) if isEligible(left, right) => | ||
| Distinct(Filter(Not(transformCondition(left, skipProject(right))), left)) | ||
| } | ||
| } | ||
|
|
||
| private def transformCondition(left: LogicalPlan, right: LogicalPlan): Expression = { | ||
| val filterCondition = | ||
| InferFiltersFromConstraints(combineFilters(right)).asInstanceOf[Filter].condition | ||
|
|
||
| val attributeNameMap: Map[String, Attribute] = left.output.map(x => (x.name, x)).toMap | ||
|
|
||
| filterCondition.transform { case a : AttributeReference => attributeNameMap(a.name) } | ||
| } | ||
|
|
||
| // TODO: This can be further extended in the future. | ||
| private def isEligible(left: LogicalPlan, right: LogicalPlan): Boolean = (left, right) match { | ||
| case (_, right @ (Project(_, _: Filter) | Filter(_, _))) => verifyConditions(left, right) | ||
| case _ => false | ||
| } | ||
|
|
||
| private def verifyConditions(left: LogicalPlan, right: LogicalPlan): Boolean = { | ||
| val leftProjectList = projectList(left) | ||
| val rightProjectList = projectList(right) | ||
|
|
||
| left.output.size == left.output.map(_.name).distinct.size && | ||
| left.find(_.expressions.exists(SubqueryExpression.hasSubquery)).isEmpty && | ||
| right.find(_.expressions.exists(SubqueryExpression.hasSubquery)).isEmpty && | ||
| Project(leftProjectList, nonFilterChild(skipProject(left))).sameResult( | ||
| Project(rightProjectList, nonFilterChild(skipProject(right)))) | ||
| } | ||
|
|
||
| private def projectList(node: LogicalPlan): Seq[NamedExpression] = node match { | ||
| case p: Project => p.projectList | ||
| case x => x.output | ||
| } | ||
|
|
||
| private def skipProject(node: LogicalPlan): LogicalPlan = node match { | ||
| case p: Project => p.child | ||
| case x => x | ||
| } | ||
|
|
||
| private def nonFilterChild(plan: LogicalPlan) = plan.find(!_.isInstanceOf[Filter]).getOrElse { | ||
| throw new IllegalStateException("Leaf node is expected") | ||
| } | ||
|
|
||
| private def combineFilters(plan: LogicalPlan): LogicalPlan = { | ||
| @tailrec | ||
| def iterate(plan: LogicalPlan, acc: LogicalPlan): LogicalPlan = { | ||
| if (acc.fastEquals(plan)) acc else iterate(acc, CombineFilters(acc)) | ||
| } | ||
| iterate(plan, CombineFilters(plan)) | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| -- Tests different scenarios of except operation | ||
| create temporary view t1 as select * from values | ||
| ("one", 1), | ||
| ("two", 2), | ||
| ("three", 3), | ||
| ("one", NULL) | ||
| as t1(k, v); | ||
|
|
||
| create temporary view t2 as select * from values | ||
| ("one", 1), | ||
| ("two", 22), | ||
| ("one", 5), | ||
| ("one", NULL), | ||
| (NULL, 5) | ||
| as t2(k, v); | ||
|
|
||
|
|
||
| -- Except operation that will be replaced by left anti join | ||
| SELECT * FROM t1 EXCEPT SELECT * FROM t2; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To see whether these SQL use the rule you added, you can add You will see the optimized plan and check whether the rule is applicable.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW, because our plan is not stable, I am not asking you for outputting the plan in the test cases. This is just to help you debug the codes. Thanks!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, good to know. I have been manually testing it via spark shell.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about the result?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All verified via spark shell using
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And please let me know when it is right time to squash the commits.. |
||
|
|
||
|
|
||
| -- Except operation that will be replaced by Filter: SPARK-22181 | ||
| SELECT * FROM t1 EXCEPT SELECT * FROM t1 where v <> 1 and v <> 2; | ||
|
|
||
|
|
||
| -- Except operation that will be replaced by Filter: SPARK-22181 | ||
| SELECT * FROM t1 where v <> 1 and v <> 22 EXCEPT SELECT * FROM t1 where v <> 2 and v >= 3; | ||
|
|
||
|
|
||
| -- Except operation that will be replaced by Filter: SPARK-22181 | ||
| SELECT t1.* FROM t1, t2 where t1.k = t2.k | ||
| EXCEPT | ||
| SELECT t1.* FROM t1, t2 where t1.k = t2.k and t1.k != 'one'; | ||
|
|
||
|
|
||
| -- Except operation that will be replaced by left anti join | ||
| SELECT * FROM t2 where v >= 1 and v <> 22 EXCEPT SELECT * FROM t1; | ||
|
|
||
|
|
||
| -- Except operation that will be replaced by left anti join | ||
| SELECT (SELECT min(k) FROM t2 WHERE t2.k = t1.k) min_t2 FROM t1 | ||
| MINUS | ||
| SELECT (SELECT min(k) FROM t2) abs_min_t2 FROM t1 WHERE t1.k = 'one'; | ||
|
|
||
|
|
||
| -- Except operation that will be replaced by left anti join | ||
| SELECT t1.k | ||
| FROM t1 | ||
| WHERE t1.v <= (SELECT max(t2.v) | ||
| FROM t2 | ||
| WHERE t2.k = t1.k) | ||
| MINUS | ||
| SELECT t1.k | ||
| FROM t1 | ||
| WHERE t1.v >= (SELECT min(t2.v) | ||
| FROM t2 | ||
| WHERE t2.k = t1.k); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not being used, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this function is not used anywhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You need to address https://github.com/apache/spark/pull/19451/files#r146315930