apache · WangGuangxin · Sep 26, 2019 · Oct 3, 2019 · Oct 4, 2019 · Oct 7, 2019
diff --git a/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala b/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
@@ -33,7 +33,8 @@ import org.apache.spark.sql.types._
        1.5
   """,
   since = "1.0.0")
-case class Average(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes {
+case class Average(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes
+  with OrderIrrelevantAggs {
 
   override def prettyName: String = "avg"
 

diff --git a/...src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala b/...src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
@@ -43,7 +43,7 @@ import org.apache.spark.sql.types._
  * @param child to compute central moments of.
  */
 abstract class CentralMomentAgg(child: Expression)
-  extends DeclarativeAggregate with ImplicitCastInputTypes {
+  extends DeclarativeAggregate with ImplicitCastInputTypes with OrderIrrelevantAggs {
 
   /**
    * The central moment order to be computed.

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala
@@ -41,7 +41,7 @@ import org.apache.spark.sql.types._
   """,
   since = "1.0.0")
 // scalastyle:on line.size.limit
-case class Count(children: Seq[Expression]) extends DeclarativeAggregate {
+case class Count(children: Seq[Expression]) extends DeclarativeAggregate with OrderIrrelevantAggs {
   override def nullable: Boolean = false
 
   // Return data type.

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Max.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Max.scala
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types._
        50
   """,
   since = "1.0.0")
-case class Max(child: Expression) extends DeclarativeAggregate {
+case class Max(child: Expression) extends DeclarativeAggregate with OrderIrrelevantAggs {
 
   override def children: Seq[Expression] = child :: Nil
 

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Min.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Min.scala
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types._
        -1
   """,
   since = "1.0.0")
-case class Min(child: Expression) extends DeclarativeAggregate {
+case class Min(child: Expression) extends DeclarativeAggregate with OrderIrrelevantAggs {
 
   override def children: Seq[Expression] = child :: Nil
 

diff --git a/.../main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/OrderIrrelevantAggs.scala b/.../main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/OrderIrrelevantAggs.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions.aggregate
+
+/**
+ * Marker trait for aggregate functions whose result does not depend on the order
+ * in which the input rows are consumed.
+ * For example, [[Sum]] is an [[OrderIrrelevantAggs]] while [[First]] is not.
+ */
+trait OrderIrrelevantAggs extends AggregateFunction
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala
@@ -36,7 +36,8 @@ import org.apache.spark.sql.types._
        NULL
   """,
   since = "1.0.0")
-case class Sum(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes {
+case class Sum(child: Expression) extends DeclarativeAggregate with ImplicitCastInputTypes
+  with OrderIrrelevantAggs {
 
   override def children: Seq[Expression] = child :: Nil
 

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -175,7 +175,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
     Batch("Join Reorder", FixedPoint(1),
       CostBasedJoinReorder) :+
     Batch("Remove Redundant Sorts", Once,
-      RemoveRedundantSorts) :+
+      RemoveRedundantSorts,
+      RemoveSortInSubquery) :+
     Batch("Decimal Optimizations", fixedPoint,
       DecimalAggregates) :+
     Batch("Object Expressions Optimization", fixedPoint,

diff --git a/...atalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveSortInSubquery.scala b/...atalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveSortInSubquery.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.expressions.{NamedExpression, PredicateHelper}
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, OrderIrrelevantAggs}
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules.Rule
+
+/**
+ * Removes [[Sort]] operators whose ordering cannot be observed by the operator above them.
+ *
+ * A [[Sort]] feeding a [[Join]], or feeding an [[Aggregate]] whose aggregate functions are
+ * all [[OrderIrrelevantAggs]], is useless as long as nothing in between depends on the
+ * order of the rows. For example,
+ *
+ * {{{
+ *   SELECT * FROM
+ *    (SELECT f1 FROM tbl1 ORDER BY f2) temp1
+ *   JOIN
+ *    (SELECT f3 FROM tbl2) temp2
+ *   ON temp1.f1 = temp2.f3
+ * }}}
+ *
+ * is equivalent to
+ *
+ * {{{
+ *   SELECT * FROM
+ *    (SELECT f1 FROM tbl1) temp1
+ *   JOIN
+ *    (SELECT f3 FROM tbl2) temp2
+ *   ON temp1.f1 = temp2.f3
+ * }}}
+ *
+ * The search for removable sorts only descends through [[Project]] nodes whose project
+ * list is deterministic. Any other operator (for instance a limit sitting on top of the
+ * sort) stops the search, so a [[Sort]] below it is kept.
+ */
+object RemoveSortInSubquery extends Rule[LogicalPlan] with PredicateHelper {
+  /**
+   * Strips every [[Sort]] reachable from the root of `plan` through [[Sort]] nodes and
+   * deterministic [[Project]] nodes.
+   *
+   * @param plan the child of an operator that does not care about input order
+   * @return `plan` without those sorts; `plan` itself when the root is any other operator
+   */
+  private def recursiveRemoveSort(plan: LogicalPlan): LogicalPlan = plan match {
+    // Keep descending: the child may be another Sort, or a Project on top of one.
+    case Sort(_, _, child) => recursiveRemoveSort(child)
+    // A non-deterministic projection can yield values that depend on the order in which
+    // rows arrive, so the Sort below it must stay in that case.
+    case Project(fields, child) if fields.forall(_.deterministic) =>
+      Project(fields, recursiveRemoveSort(child))
+    case other => other
+  }
+
+  /**
+   * Returns true when every [[AggregateExpression]] found inside `aggs` wraps an
+   * [[OrderIrrelevantAggs]] function. This is also true when `aggs` contains no
+   * aggregate expression at all.
+   */
+  private def isOrderIrrelevantAggs(aggs: Seq[NamedExpression]): Boolean = {
+    val aggExpressions = aggs.flatMap(_.collect { case ae: AggregateExpression => ae })
+    aggExpressions.forall(_.aggregateFunction.isInstanceOf[OrderIrrelevantAggs])
+  }
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    // A non-deterministic join condition may be sensitive to the order in which the
+    // input rows are evaluated, so only joins with deterministic conditions are rewritten.
+    case j @ Join(originLeft, originRight, _, condition, _)
+        if condition.forall(_.deterministic) =>
+      j.copy(left = recursiveRemoveSort(originLeft), right = recursiveRemoveSort(originRight))
+    case g @ Aggregate(_, aggs, originChild) if isOrderIrrelevantAggs(aggs) =>
+      g.copy(child = recursiveRemoveSort(originChild))
+  }
+}
diff --git a/...st/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveSortInSubquerySuite.scala b/...st/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveSortInSubquerySuite.scala
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.plans.{LeftOuter, PlanTest, RightOuter}
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+
+/**
+ * Tests for [[RemoveSortInSubquery]]: a sort below a join or below an aggregate made of
+ * order-irrelevant functions is dropped, while a sort that is followed by a limit or
+ * that feeds an order-sensitive aggregate such as `first` is kept.
+ */
+class RemoveSortInSubquerySuite extends PlanTest {
+  // Runs limit push-down first so that the rule under test sees limits in their final place.
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches =
+      Batch("Limit PushDown", FixedPoint(10), LimitPushDown) ::
+        Batch("Remove Redundant Sorts", Once, RemoveSortInSubquery) :: Nil
+  }
+
+  // Limit push-down only; used to build expected plans for the outer-limit tests.
+  object PushDownOptimizer extends RuleExecutor[LogicalPlan] {
+    val batches =
+      Batch("Limit PushDown", FixedPoint(10), LimitPushDown) :: Nil
+  }
+
+  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
+  val testRelationB = LocalRelation('d.int)
+
+  test("remove orderBy in groupBy subquery with count aggs") {
+    val projectPlan = testRelation.select('a, 'b)
+    val unnecessaryOrderByPlan = projectPlan.orderBy('a.asc, 'b.desc)
+    val groupByPlan = unnecessaryOrderByPlan.groupBy('a)(count(1))
+    val optimized = Optimize.execute(groupByPlan.analyze)
+    val correctAnswer = projectPlan.groupBy('a)(count(1)).analyze
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("remove orderBy in groupBy subquery with sum aggs") {
+    val projectPlan = testRelation.select('a, 'b)
+    val unnecessaryOrderByPlan = projectPlan.orderBy('a.asc, 'b.desc)
+    val groupByPlan = unnecessaryOrderByPlan.groupBy('a)(sum('a))
+    val optimized = Optimize.execute(groupByPlan.analyze)
+    val correctAnswer = projectPlan.groupBy('a)(sum('a)).analyze
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("should not remove orderBy in groupBy subquery with first aggs") {
+    val projectPlan = testRelation.select('a, 'b)
+    val orderByPlan = projectPlan.orderBy('a.asc, 'b.desc)
+    val groupByPlan = orderByPlan.groupBy('a)(first('a))
+    val optimized = Optimize.execute(groupByPlan.analyze)
+    val correctAnswer = groupByPlan.analyze
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("should not remove orderBy in groupBy subquery with first and count aggs") {
+    val projectPlan = testRelation.select('a, 'b)
+    val orderByPlan = projectPlan.orderBy('a.asc, 'b.desc)
+    val groupByPlan = orderByPlan.groupBy('a)(first('a), count(1))
+    val optimized = Optimize.execute(groupByPlan.analyze)
+    val correctAnswer = groupByPlan.analyze
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("should not remove orderBy with limit in groupBy subquery") {
+    val projectPlan = testRelation.select('a, 'b)
+    val orderByPlan = projectPlan.orderBy('a.asc, 'b.desc).limit(10)
+    val groupByPlan = orderByPlan.groupBy('a)(count(1))
+    val optimized = Optimize.execute(groupByPlan.analyze)
+    val correctAnswer = groupByPlan.analyze
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("remove orderBy in join subquery") {
+    val projectPlan = testRelation.select('a, 'b)
+    val unnecessaryOrderByPlan = projectPlan.orderBy('a.asc, 'b.desc)
+    val projectPlanB = testRelationB.select('d)
+    val joinPlan = unnecessaryOrderByPlan.join(projectPlanB).select('a, 'd)
+    val optimized = Optimize.execute(joinPlan.analyze)
+    val correctAnswer = projectPlan.join(projectPlanB).select('a, 'd).analyze
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("should not remove orderBy with limit in join subquery") {
+    val projectPlan = testRelation.select('a, 'b)
+    val orderByPlan = projectPlan.orderBy('a.asc, 'b.desc).limit(10)
+    val projectPlanB = testRelationB.select('d)
+    val joinPlan = orderByPlan.join(projectPlanB).select('a, 'd)
+    val optimized = Optimize.execute(joinPlan.analyze)
+    val correctAnswer = joinPlan.analyze
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("should not remove orderBy in left join subquery if there is an outer limit") {
+    val projectPlan = testRelation.select('a, 'b)
+    val orderByPlan = projectPlan.orderBy('a.asc, 'b.desc)
+    val projectPlanB = testRelationB.select('d)
+    val joinPlan = orderByPlan
+      .join(projectPlanB, LeftOuter)
+      .limit(10)
+    val optimized = Optimize.execute(joinPlan.analyze)
+    // The expected plan is the input after limit push-down only, i.e. with the sort kept.
+    val correctAnswer = PushDownOptimizer.execute(joinPlan.analyze)
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("remove orderBy in right join subquery even if there is an outer limit") {
+    val projectPlan = testRelation.select('a, 'b)
+    val orderByPlan = projectPlan.orderBy('a.asc, 'b.desc)
+    val projectPlanB = testRelationB.select('d)
+    val joinPlan = orderByPlan
+      .join(projectPlanB, RightOuter)
+      .limit(10)
+    val optimized = Optimize.execute(joinPlan.analyze)
+    val noOrderByPlan = projectPlan
+      .join(projectPlanB, RightOuter)
+      .limit(10)
+    // The expected plan is the sort-free query after limit push-down only.
+    val correctAnswer = PushDownOptimizer.execute(noOrderByPlan.analyze)
+    comparePlans(optimized, correctAnswer)
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
@@ -1080,9 +1080,8 @@ class SubquerySuite extends QueryTest with SharedSparkSession {
            |                    HAVING max(c2) > 0
            |                    ORDER  BY c1)
         """.stripMargin
-      // The rule to remove redundant sorts is not able to remove the inner sort under
-      // an Aggregate operator. We only remove the top level sort.
-      assert(getNumSortsInQuery(query6) == 1)
+
+      assert(getNumSortsInQuery(query6) == 0)
 
       // Cases when sort is not removed from the plan
       // Limit on top of sort