-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-13732] [SPARK-13797] [SQL] Remove projectList from Window and Eliminate useless Window #11565
[SPARK-13732] [SPARK-13797] [SQL] Remove projectList from Window and Eliminate useless Window #11565
Changes from 43 commits
01e4cdf
6835704
9180687
b38a21e
d2b84af
fda8025
ac0dccd
6e0018b
0546772
b37a64f
c2a872c
ab6dbd7
4276356
2dab708
0458770
1debdfa
763706d
4de6ec1
9422a4f
52bdf48
1e95df3
fab24cf
8b2e33b
2ee1876
b9f0090
ade6f7e
9fd63d2
5199d49
404214c
c001dd9
59daa48
41d5f64
25f6ff6
467b095
b169236
b229ea2
472a6e3
60fcafa
f0fbe78
f8fd37f
4dd3e66
6cf6f44
44326f1
fc96d84
6a59b42
bd35ee7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -315,21 +315,17 @@ object SetOperationPushDown extends Rule[LogicalPlan] with PredicateHelper { | |
* - LeftSemiJoin | ||
*/ | ||
object ColumnPruning extends Rule[LogicalPlan] { | ||
def sameOutput(output1: Seq[Attribute], output2: Seq[Attribute]): Boolean = | ||
private def sameOutput(output1: Seq[Attribute], output2: Seq[Attribute]): Boolean = | ||
output1.size == output2.size && | ||
output1.zip(output2).forall(pair => pair._1.semanticEquals(pair._2)) | ||
|
||
def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
// Prunes the unused columns from project list of Project/Aggregate/Window/Expand | ||
// Prunes the unused columns from project list of Project/Aggregate/Expand | ||
case p @ Project(_, p2: Project) if (p2.outputSet -- p.references).nonEmpty => | ||
p.copy(child = p2.copy(projectList = p2.projectList.filter(p.references.contains))) | ||
case p @ Project(_, a: Aggregate) if (a.outputSet -- p.references).nonEmpty => | ||
p.copy( | ||
child = a.copy(aggregateExpressions = a.aggregateExpressions.filter(p.references.contains))) | ||
case p @ Project(_, w: Window) if (w.outputSet -- p.references).nonEmpty => | ||
p.copy(child = w.copy( | ||
projectList = w.projectList.filter(p.references.contains), | ||
windowExpressions = w.windowExpressions.filter(p.references.contains))) | ||
case a @ Project(_, e @ Expand(_, _, grandChild)) if (e.outputSet -- a.references).nonEmpty => | ||
val newOutput = e.output.filter(a.references.contains(_)) | ||
val newProjects = e.projections.map { proj => | ||
|
@@ -343,11 +339,9 @@ object ColumnPruning extends Rule[LogicalPlan] { | |
case mp @ MapPartitions(_, _, _, child) if (child.outputSet -- mp.references).nonEmpty => | ||
mp.copy(child = prunedChild(child, mp.references)) | ||
|
||
// Prunes the unused columns from child of Aggregate/Window/Expand/Generate | ||
// Prunes the unused columns from child of Aggregate/Expand/Generate | ||
case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => | ||
a.copy(child = prunedChild(child, a.references)) | ||
case w @ Window(_, _, _, _, child) if (child.outputSet -- w.references).nonEmpty => | ||
w.copy(child = prunedChild(child, w.references)) | ||
case e @ Expand(_, _, child) if (child.outputSet -- e.references).nonEmpty => | ||
e.copy(child = prunedChild(child, e.references)) | ||
case g: Generate if !g.join && (g.child.outputSet -- g.references).nonEmpty => | ||
|
@@ -384,9 +378,21 @@ object ColumnPruning extends Rule[LogicalPlan] { | |
// Eliminate no-op Projects | ||
case p @ Project(projectList, child) if sameOutput(child.output, p.output) => child | ||
|
||
// Eliminate no-op Window | ||
case w: Window if w.windowExpressions.isEmpty => w.child | ||
|
||
// Can't prune the columns on LeafNode | ||
case p @ Project(_, l: LeafNode) => p | ||
|
||
// Prune windowExpressions and child of Window | ||
case p @ Project(_, w: Window) if (w.outputSet -- p.references).nonEmpty => | ||
val newWindowExprs = w.windowExpressions.filter(p.references.contains) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After rethinking this, it seems we can still separate it into 2 rules. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will do it. Thanks! |
||
val newGrandChild = | ||
prunedChild(w.child, p.references ++ AttributeSet(newWindowExprs.flatMap(_.references))) | ||
p.copy(child = w.copy( | ||
windowExpressions = newWindowExprs, | ||
child = newGrandChild)) | ||
|
||
// for all other logical plans that inherit the output from their children | ||
case p @ Project(_, child) => | ||
val required = child.references ++ p.references | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,8 @@ import org.apache.spark.sql.catalyst.analysis | |
import org.apache.spark.sql.catalyst.dsl.expressions._ | ||
import org.apache.spark.sql.catalyst.dsl.plans._ | ||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder | ||
import org.apache.spark.sql.catalyst.expressions.{Ascending, Explode, Literal, SortOrder} | ||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Count} | ||
import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} | ||
import org.apache.spark.sql.catalyst.plans.logical._ | ||
import org.apache.spark.sql.catalyst.rules.RuleExecutor | ||
|
@@ -258,6 +259,71 @@ class ColumnPruningSuite extends PlanTest { | |
comparePlans(optimized1, analysis.EliminateSubqueryAliases(correctAnswer1)) | ||
} | ||
|
||
test("Column pruning on Window with useless aggregate functions") { | ||
val input = LocalRelation('a.int, 'b.string, 'c.double, 'd.int) | ||
|
||
val originalQuery = | ||
input.groupBy('a, 'c, 'd)('a, 'c, 'd, | ||
WindowExpression( | ||
AggregateExpression(Count('b), Complete, isDistinct = false), | ||
WindowSpecDefinition( 'a :: Nil, | ||
SortOrder('b, Ascending) :: Nil, | ||
UnspecifiedFrame)).as('window)).select('a, 'c) | ||
|
||
val correctAnswer = | ||
input.select('a, 'c, 'd).groupBy('a, 'c, 'd)('a, 'c).analyze | ||
|
||
val optimized = Optimize.execute(originalQuery.analyze) | ||
|
||
comparePlans(optimized, correctAnswer) | ||
} | ||
|
||
test("Column pruning on Window with selected agg expressions") { | ||
val input = LocalRelation('a.int, 'b.string, 'c.double, 'd.int) | ||
|
||
val originalQuery = | ||
input.select('a, 'b, 'c, 'd, | ||
WindowExpression( | ||
AggregateExpression(Count('b), Complete, isDistinct = false), | ||
WindowSpecDefinition( 'a :: Nil, | ||
SortOrder('b, Ascending) :: Nil, | ||
UnspecifiedFrame)).as('window)).where('window > 1).select('a, 'c) | ||
|
||
val correctAnswer = | ||
input.select('a, 'b, 'c) | ||
.window(WindowExpression( | ||
AggregateExpression(Count('b), Complete, isDistinct = false), | ||
WindowSpecDefinition( 'a :: Nil, | ||
SortOrder('b, Ascending) :: Nil, | ||
UnspecifiedFrame)).as('window) :: Nil, | ||
'a :: Nil, 'b.asc :: Nil) | ||
.select('a, 'c, 'window).select('a, 'c, 'window, 'window) | ||
.select('a, 'c, 'window).where('window > 1).select('a, 'c).analyze | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's weird to see 3 selects here, we can add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do. Thanks! |
||
|
||
val optimized = Optimize.execute(originalQuery.analyze) | ||
|
||
comparePlans(optimized, correctAnswer) | ||
} | ||
|
||
test("Column pruning on Window in select") { | ||
val input = LocalRelation('a.int, 'b.string, 'c.double, 'd.int) | ||
|
||
val originalQuery = | ||
input.select('a, 'b, 'c, 'd, | ||
WindowExpression( | ||
AggregateExpression(Count('b), Complete, isDistinct = false), | ||
WindowSpecDefinition( 'a :: Nil, | ||
SortOrder('b, Ascending) :: Nil, | ||
UnspecifiedFrame)).as('window)).select('a, 'c) | ||
|
||
val correctAnswer = | ||
input.select('a, 'c).analyze | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: they can fit in one line There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, will do. |
||
|
||
val optimized = Optimize.execute(originalQuery.analyze) | ||
|
||
comparePlans(optimized, correctAnswer) | ||
} | ||
|
||
test("Column pruning on Union") { | ||
val input1 = LocalRelation('a.int, 'b.string, 'c.double) | ||
val input2 = LocalRelation('c.int, 'd.string, 'e.double) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move this rule near the rule that filters out useless window expressions, which makes it easier for people to understand.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure.