-
Notifications
You must be signed in to change notification settings - Fork 1.9k
feat: impl handle_child_pushdown_result for UnionExec
#20145
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
683a551
4d54b9a
e32303f
4694935
ded7d9b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1808,6 +1808,67 @@ fn test_filter_pushdown_through_union() { | |
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_filter_pushdown_through_union_mixed_support() { | ||
| // Test case where one child supports filter pushdown and one doesn't | ||
| let scan1 = TestScanBuilder::new(schema()).with_support(true).build(); | ||
| let scan2 = TestScanBuilder::new(schema()).with_support(false).build(); | ||
|
|
||
| let union = UnionExec::try_new(vec![scan1, scan2]).unwrap(); | ||
|
|
||
| let predicate = col_lit_predicate("a", "foo", &schema()); | ||
| let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap()); | ||
|
|
||
| insta::assert_snapshot!( | ||
| OptimizationTest::new(plan, FilterPushdown::new(), true), | ||
| @r" | ||
| OptimizationTest: | ||
| input: | ||
| - FilterExec: a@0 = foo | ||
| - UnionExec | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false | ||
| output: | ||
| Ok: | ||
| - UnionExec | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=a@0 = foo | ||
| - FilterExec: a@0 = foo | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false | ||
| " | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_filter_pushdown_through_union_does_not_support() { | ||
| // Test case where one child supports filter pushdown and one doesn't | ||
| let scan1 = TestScanBuilder::new(schema()).with_support(false).build(); | ||
| let scan2 = TestScanBuilder::new(schema()).with_support(false).build(); | ||
|
|
||
| let union = UnionExec::try_new(vec![scan1, scan2]).unwrap(); | ||
|
|
||
| let predicate = col_lit_predicate("a", "foo", &schema()); | ||
| let plan = Arc::new(FilterExec::try_new(predicate, union).unwrap()); | ||
|
|
||
| insta::assert_snapshot!( | ||
| OptimizationTest::new(plan, FilterPushdown::new(), true), | ||
| @" | ||
| OptimizationTest: | ||
| input: | ||
| - FilterExec: a@0 = foo | ||
| - UnionExec | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false | ||
| output: | ||
| Ok: | ||
| - UnionExec | ||
| - FilterExec: a@0 = foo | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false | ||
| - FilterExec: a@0 = foo | ||
| - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=false | ||
|
Comment on lines
+1855
to
+1867
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think even for this case, pushdown filter do not have some bad effect. |
||
| " | ||
| ); | ||
| } | ||
|
|
||
| /// Schema: | ||
| /// a: String | ||
| /// b: String | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,7 +36,11 @@ use crate::execution_plan::{ | |
| InvariantLevel, boundedness_from_children, check_default_invariants, | ||
| emission_type_from_children, | ||
| }; | ||
| use crate::filter_pushdown::{FilterDescription, FilterPushdownPhase}; | ||
| use crate::filter::FilterExec; | ||
| use crate::filter_pushdown::{ | ||
| ChildPushdownResult, FilterDescription, FilterPushdownPhase, | ||
| FilterPushdownPropagation, PushedDown, | ||
| }; | ||
| use crate::metrics::BaselineMetrics; | ||
| use crate::projection::{ProjectionExec, make_with_child}; | ||
| use crate::stream::ObservedStream; | ||
|
|
@@ -49,7 +53,9 @@ use datafusion_common::{ | |
| Result, assert_or_internal_err, exec_err, internal_datafusion_err, | ||
| }; | ||
| use datafusion_execution::TaskContext; | ||
| use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr, calculate_union}; | ||
| use datafusion_physical_expr::{ | ||
| EquivalenceProperties, PhysicalExpr, calculate_union, conjunction, | ||
| }; | ||
|
|
||
| use futures::Stream; | ||
| use itertools::Itertools; | ||
|
|
@@ -370,6 +376,80 @@ impl ExecutionPlan for UnionExec { | |
| ) -> Result<FilterDescription> { | ||
| FilterDescription::from_children(parent_filters, &self.children()) | ||
| } | ||
|
|
||
| fn handle_child_pushdown_result( | ||
| &self, | ||
| phase: FilterPushdownPhase, | ||
| child_pushdown_result: ChildPushdownResult, | ||
| _config: &ConfigOptions, | ||
| ) -> Result<FilterPushdownPropagation<Arc<dyn ExecutionPlan>>> { | ||
| // For non-Pre phase, use default behavior | ||
| if !matches!(phase, FilterPushdownPhase::Pre) { | ||
|
Comment on lines
+386
to
+387
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does the Pre / Post phase need different behavior?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i'm thinking the purpose for this pr can only happen in the pre phase, post phase is for dynamic filter, seem like not related, so i keep the default behavior
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. The intuition here is to let the creator of the filter decide what to do with it. But we should add a comment explaining this. |
||
| return Ok(FilterPushdownPropagation::if_all(child_pushdown_result)); | ||
| } | ||
|
|
||
| // UnionExec needs specialized filter pushdown handling when children have | ||
| // heterogeneous pushdown support. Without this, when some children support | ||
| // pushdown and others don't, the default behavior would leave FilterExec | ||
| // above UnionExec, re-applying filters to outputs of all children—including | ||
| // those that already applied the filters via pushdown. This specialized | ||
| // implementation adds FilterExec only to children that don't support | ||
| // pushdown, avoiding redundant filtering and improving performance. | ||
| // | ||
| // Example: Given Child1 (no pushdown support) and Child2 (has pushdown support) | ||
| // Default behavior: This implementation: | ||
| // FilterExec UnionExec | ||
| // UnionExec FilterExec | ||
| // Child1 Child1 | ||
| // Child2(filter) Child2(filter) | ||
|
|
||
| // Collect unsupported filters for each child | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In principle we could make Help me understand why we are adding the FilterExec here. i.e. no changes to the plan, not incorrect but we are applying filters to the output of Child2 that are unnecessary (it already applied these filters) With this logic we get: Which skips re-applying filters to the output of Child2. Is this interpretation correct?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct, thank you for such good explain👍
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add something along these lines as a comment justifying the added complexity?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add in ded7d9b |
||
| let mut unsupported_filters_per_child = vec![Vec::new(); self.inputs.len()]; | ||
| for parent_filter_result in child_pushdown_result.parent_filters.iter() { | ||
| for (child_idx, &child_result) in | ||
| parent_filter_result.child_results.iter().enumerate() | ||
| { | ||
| if matches!(child_result, PushedDown::No) { | ||
| unsupported_filters_per_child[child_idx] | ||
| .push(Arc::clone(&parent_filter_result.filter)); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Wrap children that have unsupported filters with FilterExec | ||
| let mut new_children = self.inputs.clone(); | ||
| for (child_idx, unsupported_filters) in | ||
| unsupported_filters_per_child.iter().enumerate() | ||
| { | ||
| if !unsupported_filters.is_empty() { | ||
| let combined_filter = conjunction(unsupported_filters.clone()); | ||
| new_children[child_idx] = Arc::new(FilterExec::try_new( | ||
| combined_filter, | ||
| Arc::clone(&self.inputs[child_idx]), | ||
| )?); | ||
| } | ||
| } | ||
|
|
||
| // Check if any children were modified | ||
| let children_modified = new_children | ||
| .iter() | ||
| .zip(self.inputs.iter()) | ||
| .any(|(new, old)| !Arc::ptr_eq(new, old)); | ||
|
|
||
| let all_filters_pushed = | ||
| vec![PushedDown::Yes; child_pushdown_result.parent_filters.len()]; | ||
| let propagation = if children_modified { | ||
| let updated_node = UnionExec::try_new(new_children)?; | ||
| FilterPushdownPropagation::with_parent_pushdown_result(all_filters_pushed) | ||
| .with_updated_node(updated_node) | ||
| } else { | ||
| FilterPushdownPropagation::with_parent_pushdown_result(all_filters_pushed) | ||
| }; | ||
|
|
||
| // Report all parent filters as supported since we've ensured they're applied | ||
| // on all children (either pushed down or via FilterExec) | ||
| Ok(propagation) | ||
| } | ||
| } | ||
|
|
||
| /// Combines multiple input streams by interleaving them. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is main purpose for this pr
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This one really seems like it should be reproducible from an SLT test
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let me try, maybe one memory and one parquet file can reproduce
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added in https://github.com/apache/datafusion/pull/20145/changes#diff-cc932d43efc8c2d251d018ecaa22d7e898b4da64d5f216c08a089e13cecfb67f