-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-29343][SQL] Eliminate sorts without limit in the subquery of Join/Aggregation #26011
Changes from 10 commits
75b43f5
e29b323
25a3bb8
d21c683
d2328a0
a9e9be9
425f76d
9072db7
8082324
f2d9ec1
4eccd2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -97,7 +97,6 @@ abstract class Optimizer(catalogManager: CatalogManager) | |
SimplifyBinaryComparison, | ||
ReplaceNullWithFalseInPredicate, | ||
PruneFilters, | ||
EliminateSorts, | ||
SimplifyCasts, | ||
SimplifyCaseConversionExpressions, | ||
RewriteCorrelatedScalarSubquery, | ||
|
@@ -174,8 +173,8 @@ abstract class Optimizer(catalogManager: CatalogManager) | |
// idempotence enforcement on this batch. We thus make it FixedPoint(1) instead of Once. | ||
Batch("Join Reorder", FixedPoint(1), | ||
CostBasedJoinReorder) :+ | ||
Batch("Remove Redundant Sorts", Once, | ||
RemoveRedundantSorts) :+ | ||
Batch("Eliminate Sorts", Once, | ||
EliminateSorts) :+ | ||
Batch("Decimal Optimizations", fixedPoint, | ||
DecimalAggregates) :+ | ||
Batch("Object Expressions Optimization", fixedPoint, | ||
|
@@ -953,40 +952,60 @@ object CombineFilters extends Rule[LogicalPlan] with PredicateHelper { | |
} | ||
|
||
/** | ||
* Removes no-op SortOrder from Sort | ||
* Removes Sort operation. This can happen: | ||
* 1) if the sort order is empty or the sort order does not have any reference | ||
* 2) if the child is already sorted | ||
* 3) if there is another Sort operator separated by 0...n Project/Filter operators | ||
* 4) if the Sort operator is within Join without Limit and having deterministic conditions only | ||
* 5) if the Sort operator is within GroupBy and the aggregate function is order irrelevant | ||
*/ | ||
object EliminateSorts extends Rule[LogicalPlan] { | ||
def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
case s @ Sort(orders, _, child) if orders.isEmpty || orders.exists(_.child.foldable) => | ||
val newOrders = orders.filterNot(_.child.foldable) | ||
if (newOrders.isEmpty) child else s.copy(order = newOrders) | ||
} | ||
} | ||
|
||
/** | ||
* Removes redundant Sort operation. This can happen: | ||
* 1) if the child is already sorted | ||
* 2) if there is another Sort operator separated by 0...n Project/Filter operators | ||
*/ | ||
object RemoveRedundantSorts extends Rule[LogicalPlan] { | ||
def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { | ||
case Sort(orders, true, child) if SortOrder.orderingSatisfies(child.outputOrdering, orders) => | ||
child | ||
case s @ Sort(_, _, child) => s.copy(child = recursiveRemoveSort(child)) | ||
case j @ Join(originLeft, originRight, _, cond, _) if cond.forall(_.deterministic) => | ||
j.copy(left = recursiveRemoveSort(originLeft), right = recursiveRemoveSort(originRight)) | ||
case g @ Aggregate(_, aggs, originChild) if isOrderIrrelevantAggs(aggs) => | ||
g.copy(child = recursiveRemoveSort(originChild)) | ||
} | ||
|
||
def recursiveRemoveSort(plan: LogicalPlan): LogicalPlan = plan match { | ||
private def recursiveRemoveSort(plan: LogicalPlan): LogicalPlan = plan match { | ||
case Sort(_, _, child) => recursiveRemoveSort(child) | ||
case other if canEliminateSort(other) => | ||
other.withNewChildren(other.children.map(recursiveRemoveSort)) | ||
case _ => plan | ||
} | ||
|
||
def canEliminateSort(plan: LogicalPlan): Boolean = plan match { | ||
private def canEliminateSort(plan: LogicalPlan): Boolean = plan match { | ||
case p: Project => p.projectList.forall(_.deterministic) | ||
case f: Filter => f.condition.deterministic | ||
case _ => false | ||
} | ||
|
||
private def isOrderIrrelevantAggs(aggs: Seq[NamedExpression]): Boolean = { | ||
def isOrderIrrelevantAggFunction(func: AggregateFunction): Boolean = func match { | ||
case _: Sum => true | ||
case _: Min => true | ||
case _: Max => true | ||
case _: Count => true | ||
case _: Average => true | ||
case _: CentralMomentAgg => true | ||
case _ => false | ||
} | ||
|
||
def checkValidAggregateExpression(expr: Expression): Boolean = expr match { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: private? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We cannot make it private because this is a nested function within There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ur, this is an inner func. I see. |
||
case _: AttributeReference => true | ||
case ae: AggregateExpression => isOrderIrrelevantAggFunction(ae.aggregateFunction) | ||
case _: UserDefinedExpression => false | ||
case e => e.children.forall(checkValidAggregateExpression) | ||
} | ||
|
||
aggs.forall(checkValidAggregateExpression) | ||
} | ||
} | ||
|
||
/** | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could still have a precision difference after eliminating the sort for the floating point data type. I am afraid some end users might prefer to adding a sort in these cases to ensure the results are consistent?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cc @maryannxue @cloud-fan
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah that's a good point. AVG over floating values is order sensitive. Not sure if this can really affect queries in practice, but better to be conservative here. @WangGuangxin can you fix it in a followup?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I'll fix it in a followup
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@WangGuangxin do you have no time for the follow-up now? Could I take this over?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#26534
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same will be true of all central moments, BTW
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I revisit this and have a small question. In fact
Avg
is transformed toSum
andCount
, so I think there should be no precision problem?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sum
is the issue itself, actually; see the followup PR