-
Notifications
You must be signed in to change notification settings - Fork 1.8k
feat: integrate batch coalescer with repartition exec #19002
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -30,6 +30,7 @@ use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; | |||||||||||||
| use super::{ | ||||||||||||||
| DisplayAs, ExecutionPlanProperties, RecordBatchStream, SendableRecordBatchStream, | ||||||||||||||
| }; | ||||||||||||||
| use crate::coalesce::LimitedBatchCoalescer; | ||||||||||||||
| use crate::execution_plan::{CardinalityEffect, EvaluationType, SchedulingType}; | ||||||||||||||
| use crate::hash_utils::create_hashes; | ||||||||||||||
| use crate::metrics::{BaselineMetrics, SpillMetrics}; | ||||||||||||||
|
|
@@ -932,6 +933,7 @@ impl ExecutionPlan for RepartitionExec { | |||||||||||||
| spill_stream, | ||||||||||||||
| 1, // Each receiver handles one input partition | ||||||||||||||
| BaselineMetrics::new(&metrics, partition), | ||||||||||||||
| None, // subsequent merge sort already does batching https://github.com/apache/datafusion/blob/e4dcf0c85611ad0bd291f03a8e03fe56d773eb16/datafusion/physical-plan/src/sorts/merge.rs#L286 | ||||||||||||||
| )) as SendableRecordBatchStream | ||||||||||||||
| }) | ||||||||||||||
| .collect::<Vec<_>>(); | ||||||||||||||
|
|
@@ -970,6 +972,7 @@ impl ExecutionPlan for RepartitionExec { | |||||||||||||
| spill_stream, | ||||||||||||||
| num_input_partitions, | ||||||||||||||
| BaselineMetrics::new(&metrics, partition), | ||||||||||||||
| Some(context.session_config().batch_size()), | ||||||||||||||
| )) as SendableRecordBatchStream) | ||||||||||||||
| } | ||||||||||||||
| }) | ||||||||||||||
|
|
@@ -1427,9 +1430,12 @@ struct PerPartitionStream { | |||||||||||||
|
|
||||||||||||||
| /// Execution metrics | ||||||||||||||
| baseline_metrics: BaselineMetrics, | ||||||||||||||
|
|
||||||||||||||
| batch_coalescer: Option<LimitedBatchCoalescer>, | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| impl PerPartitionStream { | ||||||||||||||
| #[allow(clippy::too_many_arguments)] | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
By using
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for pointing out, will update in a revision later. |
||||||||||||||
| fn new( | ||||||||||||||
| schema: SchemaRef, | ||||||||||||||
| receiver: DistributionReceiver<MaybeBatch>, | ||||||||||||||
|
|
@@ -1438,7 +1444,10 @@ impl PerPartitionStream { | |||||||||||||
| spill_stream: SendableRecordBatchStream, | ||||||||||||||
| num_input_partitions: usize, | ||||||||||||||
| baseline_metrics: BaselineMetrics, | ||||||||||||||
| batch_size: Option<usize>, | ||||||||||||||
| ) -> Self { | ||||||||||||||
| let batch_coalescer = | ||||||||||||||
| batch_size.map(|s| LimitedBatchCoalescer::new(Arc::clone(&schema), s, None)); | ||||||||||||||
| Self { | ||||||||||||||
| schema, | ||||||||||||||
| receiver, | ||||||||||||||
|
|
@@ -1448,6 +1457,7 @@ impl PerPartitionStream { | |||||||||||||
| state: StreamState::ReadingMemory, | ||||||||||||||
| remaining_partitions: num_input_partitions, | ||||||||||||||
| baseline_metrics, | ||||||||||||||
| batch_coalescer, | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -1540,7 +1550,46 @@ impl Stream for PerPartitionStream { | |||||||||||||
| mut self: Pin<&mut Self>, | ||||||||||||||
| cx: &mut Context<'_>, | ||||||||||||||
| ) -> Poll<Option<Self::Item>> { | ||||||||||||||
| let poll = self.poll_next_inner(cx); | ||||||||||||||
| let poll = match self.batch_coalescer.take() { | ||||||||||||||
| Some(mut coalescer) => { | ||||||||||||||
| let cloned_time = self.baseline_metrics.elapsed_compute().clone(); | ||||||||||||||
| let mut completed = false; | ||||||||||||||
| let poll; | ||||||||||||||
| loop { | ||||||||||||||
| if let Some(batch) = coalescer.next_completed_batch() { | ||||||||||||||
| poll = Poll::Ready(Some(Ok(batch))); | ||||||||||||||
| break; | ||||||||||||||
| } | ||||||||||||||
| if completed { | ||||||||||||||
| poll = Poll::Ready(None); | ||||||||||||||
| break; | ||||||||||||||
| } | ||||||||||||||
| let inner_poll = self.poll_next_inner(cx); | ||||||||||||||
| let _timer = cloned_time.timer(); | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be before the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My understanding was that |
||||||||||||||
|
|
||||||||||||||
| match inner_poll { | ||||||||||||||
| Poll::Pending => { | ||||||||||||||
| poll = Poll::Pending; | ||||||||||||||
| break; | ||||||||||||||
| } | ||||||||||||||
| Poll::Ready(None) => { | ||||||||||||||
| completed = true; | ||||||||||||||
| coalescer.finish()?; | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Otherwise in case of an error the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think in this case, the loop would continue and then |
||||||||||||||
| } | ||||||||||||||
| Poll::Ready(Some(Ok(batch))) => { | ||||||||||||||
| coalescer.push_batch(batch)?; | ||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to reply above, I think in this case, the loop would continue and then break on line 1561 or 1565. Then line 1588 is reached, which restores the |
||||||||||||||
| } | ||||||||||||||
| Poll::Ready(Some(err)) => { | ||||||||||||||
| poll = Poll::Ready(Some(err)); | ||||||||||||||
| break; | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
| self.batch_coalescer = Some(coalescer); | ||||||||||||||
| poll | ||||||||||||||
| } | ||||||||||||||
| None => self.poll_next_inner(cx), | ||||||||||||||
| }; | ||||||||||||||
| self.baseline_metrics.record_poll(poll) | ||||||||||||||
| } | ||||||||||||||
| } | ||||||||||||||
|
|
@@ -1577,7 +1626,6 @@ mod tests { | |||||||||||||
| use datafusion_common_runtime::JoinSet; | ||||||||||||||
| use datafusion_execution::runtime_env::RuntimeEnvBuilder; | ||||||||||||||
| use insta::assert_snapshot; | ||||||||||||||
| use itertools::Itertools; | ||||||||||||||
|
|
||||||||||||||
| #[tokio::test] | ||||||||||||||
| async fn one_to_many_round_robin() -> Result<()> { | ||||||||||||||
|
|
@@ -1591,10 +1639,13 @@ mod tests { | |||||||||||||
| repartition(&schema, partitions, Partitioning::RoundRobinBatch(4)).await?; | ||||||||||||||
|
|
||||||||||||||
| assert_eq!(4, output_partitions.len()); | ||||||||||||||
| assert_eq!(13, output_partitions[0].len()); | ||||||||||||||
| assert_eq!(13, output_partitions[1].len()); | ||||||||||||||
| assert_eq!(12, output_partitions[2].len()); | ||||||||||||||
| assert_eq!(12, output_partitions[3].len()); | ||||||||||||||
| for partition in &output_partitions { | ||||||||||||||
| assert_eq!(1, partition.len()); | ||||||||||||||
| } | ||||||||||||||
| assert_eq!(13 * 8, output_partitions[0][0].num_rows()); | ||||||||||||||
| assert_eq!(13 * 8, output_partitions[1][0].num_rows()); | ||||||||||||||
| assert_eq!(12 * 8, output_partitions[2][0].num_rows()); | ||||||||||||||
| assert_eq!(12 * 8, output_partitions[3][0].num_rows()); | ||||||||||||||
|
|
||||||||||||||
| Ok(()) | ||||||||||||||
| } | ||||||||||||||
|
|
@@ -1611,7 +1662,7 @@ mod tests { | |||||||||||||
| repartition(&schema, partitions, Partitioning::RoundRobinBatch(1)).await?; | ||||||||||||||
|
|
||||||||||||||
| assert_eq!(1, output_partitions.len()); | ||||||||||||||
| assert_eq!(150, output_partitions[0].len()); | ||||||||||||||
| assert_eq!(150 * 8, output_partitions[0][0].num_rows()); | ||||||||||||||
|
|
||||||||||||||
| Ok(()) | ||||||||||||||
| } | ||||||||||||||
|
|
@@ -1627,12 +1678,12 @@ mod tests { | |||||||||||||
| let output_partitions = | ||||||||||||||
| repartition(&schema, partitions, Partitioning::RoundRobinBatch(5)).await?; | ||||||||||||||
|
|
||||||||||||||
| let total_rows_per_partition = 8 * 50 * 3 / 5; | ||||||||||||||
| assert_eq!(5, output_partitions.len()); | ||||||||||||||
| assert_eq!(30, output_partitions[0].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[1].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[2].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[3].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[4].len()); | ||||||||||||||
| for partition in output_partitions { | ||||||||||||||
| assert_eq!(1, partition.len()); | ||||||||||||||
| assert_eq!(total_rows_per_partition, partition[0].num_rows()); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| Ok(()) | ||||||||||||||
| } | ||||||||||||||
|
|
@@ -1707,12 +1758,12 @@ mod tests { | |||||||||||||
|
|
||||||||||||||
| let output_partitions = handle.join().await.unwrap().unwrap(); | ||||||||||||||
|
|
||||||||||||||
| let total_rows_per_partition = 8 * 50 * 3 / 5; | ||||||||||||||
| assert_eq!(5, output_partitions.len()); | ||||||||||||||
| assert_eq!(30, output_partitions[0].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[1].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[2].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[3].len()); | ||||||||||||||
| assert_eq!(30, output_partitions[4].len()); | ||||||||||||||
| for partition in output_partitions { | ||||||||||||||
| assert_eq!(1, partition.len()); | ||||||||||||||
| assert_eq!(total_rows_per_partition, partition[0].num_rows()); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| Ok(()) | ||||||||||||||
| } | ||||||||||||||
|
|
@@ -1950,14 +2001,13 @@ mod tests { | |||||||||||||
| }); | ||||||||||||||
| let batches_with_drop = crate::common::collect(output_stream1).await.unwrap(); | ||||||||||||||
|
|
||||||||||||||
| fn sort(batch: Vec<RecordBatch>) -> Vec<RecordBatch> { | ||||||||||||||
| batch | ||||||||||||||
| .into_iter() | ||||||||||||||
| .sorted_by_key(|b| format!("{b:?}")) | ||||||||||||||
| .collect() | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| assert_eq!(sort(batches_without_drop), sort(batches_with_drop)); | ||||||||||||||
| let items_vec_with_drop = str_batches_to_vec(&batches_with_drop); | ||||||||||||||
| let items_set_with_drop: HashSet<&str> = | ||||||||||||||
| items_vec_with_drop.iter().copied().collect(); | ||||||||||||||
| assert_eq!( | ||||||||||||||
| items_set_with_drop.symmetric_difference(&items_set).count(), | ||||||||||||||
| 0 | ||||||||||||||
| ); | ||||||||||||||
| } | ||||||||||||||
|
|
||||||||||||||
| fn str_batches_to_vec(batches: &[RecordBatch]) -> Vec<&str> { | ||||||||||||||
|
|
||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please see #18782 (comment) for my thoughts/reason on updating this test.