Implement hash partitioned aggregation #320

Merged · 18 commits · May 16, 2021
Changes from 15 commits
1 change: 1 addition & 0 deletions ballista/rust/core/proto/ballista.proto
@@ -396,6 +396,7 @@ message ProjectionExecNode {
enum AggregateMode {
PARTIAL = 0;
FINAL = 1;
FINAL_PARTITIONED = 2;
}

message HashAggregateExecNode {
3 changes: 3 additions & 0 deletions ballista/rust/core/src/serde/physical_plan/from_proto.rs
@@ -199,6 +199,9 @@ impl TryInto<Arc<dyn ExecutionPlan>> for &protobuf::PhysicalPlanNode {
let agg_mode: AggregateMode = match mode {
protobuf::AggregateMode::Partial => AggregateMode::Partial,
protobuf::AggregateMode::Final => AggregateMode::Final,
protobuf::AggregateMode::FinalPartitioned => {
AggregateMode::FinalPartitioned
}
};

let group = hash_agg
3 changes: 3 additions & 0 deletions ballista/rust/core/src/serde/physical_plan/to_proto.rs
@@ -172,6 +172,9 @@ impl TryInto<protobuf::PhysicalPlanNode> for Arc<dyn ExecutionPlan> {
let agg_mode = match exec.mode() {
AggregateMode::Partial => protobuf::AggregateMode::Partial,
AggregateMode::Final => protobuf::AggregateMode::Final,
AggregateMode::FinalPartitioned => {
protobuf::AggregateMode::FinalPartitioned
}
};
let input_schema = exec.input_schema();
let input: protobuf::PhysicalPlanNode = exec.input().to_owned().try_into()?;
1 change: 1 addition & 0 deletions ballista/rust/core/src/utils.rs
@@ -322,6 +322,7 @@ pub fn create_datafusion_context() -> ExecutionContext {
let config = ExecutionConfig::new()
.with_concurrency(1)
.with_repartition_joins(false)
.with_repartition_aggregations(false)
.with_physical_optimizer_rules(rules);
ExecutionContext::with_config(config)
}
7 changes: 2 additions & 5 deletions ballista/rust/scheduler/src/planner.rs
@@ -128,7 +128,7 @@ impl DistributedPlanner {
//TODO should insert query stages in a more generic way based on partitioning metadata
// and not specifically for this operator
match agg.mode() {
AggregateMode::Final => {
AggregateMode::Final | AggregateMode::FinalPartitioned => {
let mut new_children: Vec<Arc<dyn ExecutionPlan>> = vec![];
for child in &children {
let new_stage = create_query_stage(
@@ -237,10 +237,9 @@ mod test {
use ballista_core::serde::protobuf;
use ballista_core::utils::format_plan;
use datafusion::physical_plan::hash_aggregate::HashAggregateExec;
use datafusion::physical_plan::merge::MergeExec;
use datafusion::physical_plan::projection::ProjectionExec;
use datafusion::physical_plan::sort::SortExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::physical_plan::{merge::MergeExec, projection::ProjectionExec};
use std::convert::TryInto;
use std::sync::Arc;
use uuid::Uuid;
@@ -278,11 +277,9 @@
QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=1
HashAggregateExec: groupBy=["l_returnflag"], aggrExpr=["SUM(l_extendedprice Multiply Int64(1)) [\"l_extendedprice * CAST(1 AS Float64)\"]"]
CsvExec: testdata/lineitem; partitions=2

QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=2
MergeExec
UnresolvedShuffleExec: stages=[1]

QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=3
SortExec { input: ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_ext
ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_extendedprice Multip
6 changes: 4 additions & 2 deletions ballista/rust/scheduler/src/test_utils.rs
@@ -33,10 +33,12 @@ pub const TPCH_TABLES: &[&str] = &[
pub fn datafusion_test_context(path: &str) -> Result<ExecutionContext> {
// remove Repartition rule because that isn't supported yet
let rules: Vec<Arc<dyn PhysicalOptimizerRule + Send + Sync>> = vec![
Arc::new(CoalesceBatches::new()),
Arc::new(AddMergeExec::new()),
Arc::new(CoalesceBatches::new()),
];
let config = ExecutionConfig::new().with_physical_optimizer_rules(rules);
let config = ExecutionConfig::new()
.with_physical_optimizer_rules(rules)
.with_repartition_aggregations(false);
let mut ctx = ExecutionContext::with_config(config);

for table in TPCH_TABLES {
19 changes: 9 additions & 10 deletions datafusion/src/execution/context.rs
@@ -628,6 +628,9 @@ pub struct ExecutionConfig {
/// Should DataFusion repartition data using the join keys to execute joins in parallel
/// using the provided `concurrency` level
pub repartition_joins: bool,
/// Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel
/// using the provided `concurrency` level
pub repartition_aggregations: bool,
}

impl ExecutionConfig {
@@ -655,6 +658,7 @@
create_default_catalog_and_schema: true,
information_schema: false,
repartition_joins: true,
repartition_aggregations: true,
}
}

@@ -738,6 +742,11 @@ impl ExecutionConfig {
self.repartition_joins = enabled;
self
}
/// Enables or disables the use of repartitioning for aggregations to improve parallelism
pub fn with_repartition_aggregations(mut self, enabled: bool) -> Self {
self.repartition_aggregations = enabled;
self
}
}

/// Execution context for registering data sources and executing queries
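As a usage note (not part of this diff): a minimal sketch of toggling the new flag, using only the builder methods visible in this PR (`with_concurrency`, `with_repartition_joins`, `with_repartition_aggregations`) and `ExecutionContext::with_config`. The helper name is made up for illustration.

```rust
use datafusion::execution::context::{ExecutionConfig, ExecutionContext};

// Hypothetical helper: enable hash-partitioned aggregation (the default per this
// diff) alongside join repartitioning, with 8 target partitions.
fn partitioned_agg_context() -> ExecutionContext {
    let config = ExecutionConfig::new()
        .with_concurrency(8)
        .with_repartition_joins(true)
        .with_repartition_aggregations(true);
    ExecutionContext::with_config(config)
}
```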
@@ -1305,7 +1314,6 @@ mod tests {
#[tokio::test]
async fn aggregate_grouped() -> Result<()> {
let results = execute("SELECT c1, SUM(c2) FROM test GROUP BY c1", 4).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+----+---------+",
@@ -1325,7 +1333,6 @@
#[tokio::test]
async fn aggregate_grouped_avg() -> Result<()> {
let results = execute("SELECT c1, AVG(c2) FROM test GROUP BY c1", 4).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+----+---------+",
Expand All @@ -1346,7 +1353,6 @@ mod tests {
async fn boolean_literal() -> Result<()> {
let results =
execute("SELECT c1, c3 FROM test WHERE c1 > 2 AND c3 = true", 4).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+----+------+",
@@ -1368,7 +1374,6 @@
async fn aggregate_grouped_empty() -> Result<()> {
let results =
execute("SELECT c1, AVG(c2) FROM test WHERE c1 = 123 GROUP BY c1", 4).await?;
assert_eq!(results.len(), 1);

let expected = vec!["++", "||", "++", "++"];
assert_batches_sorted_eq!(expected, &results);
@@ -1379,7 +1384,6 @@
#[tokio::test]
async fn aggregate_grouped_max() -> Result<()> {
let results = execute("SELECT c1, MAX(c2) FROM test GROUP BY c1", 4).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+----+---------+",
@@ -1399,7 +1403,6 @@
#[tokio::test]
async fn aggregate_grouped_min() -> Result<()> {
let results = execute("SELECT c1, MIN(c2) FROM test GROUP BY c1", 4).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+----+---------+",
@@ -1583,7 +1586,6 @@ mod tests {
#[tokio::test]
async fn count_aggregated() -> Result<()> {
let results = execute("SELECT c1, COUNT(c2) FROM test GROUP BY c1", 4).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+----+-----------+",
@@ -1635,7 +1637,6 @@
&mut ctx,
"SELECT date_trunc('week', t1) as week, SUM(c2) FROM test GROUP BY date_trunc('week', t1)"
).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+---------------------+---------+",
@@ -1879,7 +1880,6 @@ mod tests {
];

let results = run_count_distinct_integers_aggregated_scenario(partitions).await?;
assert_eq!(results.len(), 1);

let expected = vec!
[
@@ -1907,7 +1907,6 @@
];

let results = run_count_distinct_integers_aggregated_scenario(partitions).await?;
assert_eq!(results.len(), 1);

let expected = vec![
"+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+",
1 change: 1 addition & 0 deletions datafusion/src/physical_optimizer/merge_exec.rs
@@ -52,6 +52,7 @@ impl PhysicalOptimizerRule for AddMergeExec {
.collect::<Result<Vec<_>>>()?;
match plan.required_child_distribution() {
Distribution::UnspecifiedDistribution => plan.with_new_children(children),
Distribution::HashPartitioned(_) => plan.with_new_children(children),
Distribution::SinglePartition => plan.with_new_children(
children
.iter()
5 changes: 4 additions & 1 deletion datafusion/src/physical_optimizer/repartition.rs
@@ -52,7 +52,10 @@ fn optimize_concurrency(
.map(|child| {
optimize_concurrency(
concurrency,
plan.required_child_distribution() == Distribution::SinglePartition,
matches!(
plan.required_child_distribution(),
Distribution::SinglePartition
),
child.clone(),
)
})
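A side note on the switch from `==` to `matches!` above: once `Distribution` gains the payload-carrying `HashPartitioned(Vec<Arc<dyn PhysicalExpr>>)` variant, the enum can most likely no longer derive `PartialEq` (trait objects are not comparable), so only the variant itself can be checked. A standalone sketch of the pattern, using a toy enum rather than the real `Distribution` type:

```rust
// Toy stand-in for Distribution (not the DataFusion type).
enum Requirement {
    Single,
    Hashed(Vec<String>), // placeholder payload
}

fn needs_single_partition(req: &Requirement) -> bool {
    // `matches!` only inspects the variant, so no PartialEq impl is required.
    matches!(req, Requirement::Single)
}

fn main() {
    assert!(needs_single_partition(&Requirement::Single));
    assert!(!needs_single_partition(&Requirement::Hashed(vec!["a".into()])));
}
```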
22 changes: 17 additions & 5 deletions datafusion/src/physical_plan/hash_aggregate.rs
@@ -76,6 +76,13 @@ pub enum AggregateMode {
Partial,
/// Final aggregate that produces a single partition of output
Final,
/// Final aggregate that works on pre-partitioned data.
///
/// This requires the invariant that all rows with a particular
/// grouping key are in the same partition, such as is the case
/// with hash repartitioning on the group keys. If a group key
/// appeared in more than one partition, duplicate groups would be produced.
FinalPartitioned,
}

/// Hash aggregate execution plan
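The invariant behind `FinalPartitioned` can be illustrated with a self-contained sketch (plain `std`, toy types, no DataFusion APIs): if rows are routed to partitions by hashing the group key, each partition can finish its aggregation independently and the per-partition results are simply concatenated, because no group key spans two partitions.

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

// Route each (key, value) row to a partition by hashing the group key,
// mirroring what a hash repartition on the group keys guarantees.
fn hash_partition<'a>(
    rows: &[(&'a str, i64)],
    partitions: usize,
) -> Vec<Vec<(&'a str, i64)>> {
    let mut out = vec![Vec::new(); partitions];
    for &(key, value) in rows {
        let mut hasher = DefaultHasher::new();
        key.hash(&mut hasher);
        out[(hasher.finish() as usize) % partitions].push((key, value));
    }
    out
}

fn main() {
    let rows: [(&str, i64); 5] = [("a", 1), ("b", 2), ("a", 3), ("c", 4), ("b", 5)];
    let mut totals: Vec<(String, i64)> = Vec::new();
    for partition in hash_partition(&rows, 4) {
        // "FinalPartitioned"-style step: aggregate each partition on its own.
        let mut groups: HashMap<&str, i64> = HashMap::new();
        for (key, value) in partition {
            *groups.entry(key).or_insert(0) += value;
        }
        // No cross-partition merge is needed: a given key lives in exactly one partition.
        totals.extend(groups.into_iter().map(|(k, v)| (k.to_string(), v)));
    }
    totals.sort();
    let expected: Vec<(String, i64)> =
        vec![("a".to_string(), 4), ("b".to_string(), 7), ("c".to_string(), 4)];
    assert_eq!(totals, expected);
}
```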
@@ -121,7 +128,7 @@ fn create_schema(
fields.extend(expr.state_fields()?.iter().cloned())
}
}
AggregateMode::Final => {
AggregateMode::Final | AggregateMode::FinalPartitioned => {
// in final mode, the field with the final result of the accumulator
for expr in aggr_expr {
fields.push(expr.field()?)
@@ -202,6 +209,9 @@ impl ExecutionPlan for HashAggregateExec {
fn required_child_distribution(&self) -> Distribution {
match &self.mode {
AggregateMode::Partial => Distribution::UnspecifiedDistribution,
AggregateMode::FinalPartitioned => Distribution::HashPartitioned(
self.group_expr.iter().map(|x| x.0.clone()).collect(),
),
AggregateMode::Final => Distribution::SinglePartition,
}
}
@@ -419,7 +429,7 @@ fn group_aggregate_batch(
})
.try_for_each(|(accumulator, values)| match mode {
AggregateMode::Partial => accumulator.update_batch(&values),
AggregateMode::Final => {
AggregateMode::FinalPartitioned | AggregateMode::Final => {
// note: the aggregation here is over states, not values, thus the merge
accumulator.merge_batch(&values)
}
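To make the `update_batch` vs. `merge_batch` distinction concrete, here is a standalone sketch with toy types (not the DataFusion `Accumulator` trait): a partial AVG keeps `(sum, count)` state per partition, and the final phases, whether `Final` or `FinalPartitioned`, merge those states rather than re-reading raw values.

```rust
// Toy average accumulator: `update_batch` consumes raw input values (Partial mode),
// while `merge_batch` consumes the (sum, count) states emitted by other
// accumulators (Final / FinalPartitioned modes).
#[derive(Default)]
struct AvgAccumulator {
    sum: f64,
    count: u64,
}

impl AvgAccumulator {
    fn update_batch(&mut self, values: &[f64]) {
        self.sum += values.iter().sum::<f64>();
        self.count += values.len() as u64;
    }

    fn merge_batch(&mut self, states: &[(f64, u64)]) {
        for &(sum, count) in states {
            self.sum += sum;
            self.count += count;
        }
    }

    fn evaluate(&self) -> f64 {
        self.sum / self.count as f64
    }
}

fn main() {
    // Two "partial" accumulators, e.g. one per input partition.
    let mut p1 = AvgAccumulator::default();
    let mut p2 = AvgAccumulator::default();
    p1.update_batch(&[1.0, 2.0, 3.0]);
    p2.update_batch(&[4.0, 5.0]);

    // Final phase: the aggregation is over states, not values -- hence the merge.
    let mut final_acc = AvgAccumulator::default();
    final_acc.merge_batch(&[(p1.sum, p1.count), (p2.sum, p2.count)]);
    assert_eq!(final_acc.evaluate(), 3.0);
}
```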
@@ -772,7 +782,7 @@ fn aggregate_expressions(
Ok(aggr_expr.iter().map(|agg| agg.expressions()).collect())
}
// in this mode, we build the merge expressions of the aggregation
AggregateMode::Final => Ok(aggr_expr
AggregateMode::Final | AggregateMode::FinalPartitioned => Ok(aggr_expr
.iter()
.map(|agg| merge_expressions(agg))
.collect::<Result<Vec<_>>>()?),
@@ -866,7 +876,9 @@ fn aggregate_batch(
// 1.3
match mode {
AggregateMode::Partial => accum.update_batch(values),
AggregateMode::Final => accum.merge_batch(values),
AggregateMode::Final | AggregateMode::FinalPartitioned => {
accum.merge_batch(values)
}
}
})
}
@@ -1039,7 +1051,7 @@ fn finalize_aggregation(
.collect::<Result<Vec<_>>>()?;
Ok(a.iter().flatten().cloned().collect::<Vec<_>>())
}
AggregateMode::Final => {
AggregateMode::Final | AggregateMode::FinalPartitioned => {
// merge the state to the final value
accumulators
.iter()