From 3da6e8261220938e3c0e929815ce2b9b37d448eb Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 13:57:24 +0300 Subject: [PATCH 01/23] Convert ordering equivalence to vec (unit test) --- datafusion/physical-expr/src/equivalence.rs | 114 +++++++- datafusion/physical-expr/src/utils.rs | 285 +++++++++++++++++++- 2 files changed, 382 insertions(+), 17 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index ab3443424b031..0c611bd50cc0a 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -22,6 +22,8 @@ use arrow_schema::SortOptions; use std::collections::{HashMap, HashSet}; use std::hash::Hash; +use std::sync::Arc; +use crate::PhysicalSortExpr; /// Equivalence Properties is a vec of EquivalentClass. #[derive(Debug, Clone)] @@ -118,6 +120,7 @@ impl EquivalenceProperties { /// `OrderingEquivalenceProperties`, we can keep track of these equivalences /// and treat `a ASC` and `b DESC` as the same ordering requirement. pub type OrderingEquivalenceProperties = EquivalenceProperties; +pub type OrderingEquivalenceProperties2 = EquivalenceProperties>; /// EquivalentClass is a set of [`Column`]s or [`OrderedColumn`]s that are known /// to have the same value in all tuples in a relation. 
`EquivalentClass` @@ -185,6 +188,67 @@ impl EquivalentClass { } } +#[derive(Debug, Clone)] +pub struct EquivalentClass2> { + /// First element in the EquivalentClass + head: T, + /// Other equal columns + others: HashSet, +} + +impl EquivalentClass2 { + pub fn new(head: T, others: Vec) -> EquivalentClass2 { + EquivalentClass2 { + head, + others: HashSet::from_iter(others), + } + } + + pub fn head(&self) -> &T { + &self.head + } + + pub fn others(&self) -> &HashSet { + &self.others + } + + pub fn contains(&self, col: &T) -> bool { + self.head == *col || self.others.contains(col) + } + + pub fn insert(&mut self, col: T) -> bool { + self.head != col && self.others.insert(col) + } + + pub fn remove(&mut self, col: &T) -> bool { + let removed = self.others.remove(col); + if !removed && *col == self.head { + let one_col = self.others.iter().next().cloned(); + if let Some(col) = one_col { + let removed = self.others.remove(&col); + self.head = col; + removed + } else { + false + } + } else { + removed + } + } + + pub fn iter(&self) -> impl Iterator { + std::iter::once(&self.head).chain(self.others.iter()) + } + + pub fn len(&self) -> usize { + self.others.len() + 1 + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + /// This object represents a [`Column`] with a definite ordering. #[derive(Debug, Hash, PartialEq, Eq, Clone)] pub struct OrderedColumn { @@ -198,6 +262,16 @@ impl OrderedColumn { } } +impl Into for OrderedColumn{ + fn into(self) -> PhysicalSortExpr { + PhysicalSortExpr{ + expr: Arc::new(self.col) as _, + options: self.options + } + } +} + + trait ColumnAccessor { fn column(&self) -> &Column; } @@ -215,6 +289,7 @@ impl ColumnAccessor for OrderedColumn { } pub type OrderingEquivalentClass = EquivalentClass; +pub type OrderingEquivalentClass2 = EquivalentClass>; impl OrderingEquivalentClass { /// Finds the matching column inside the `OrderingEquivalentClass`. 
@@ -230,6 +305,30 @@ impl OrderingEquivalentClass { None } } + + fn update_with_aliases(&mut self, columns_map: &HashMap>) { + for (column, columns) in columns_map { + if self.head.col.eq(column) { + for col in columns { + self.insert(OrderedColumn { + col: col.clone(), + options: self.head.options, + }); + } + } else { + for item in self.others.clone() { + if item.col.eq(column) { + for col in columns { + self.insert(OrderedColumn { + col: col.clone(), + options: item.options, + }); + } + } + } + } + } + } } /// This function applies the given projection to the given equivalence @@ -276,19 +375,8 @@ pub fn project_ordering_equivalence_properties( output_eq: &mut OrderingEquivalenceProperties, ) { let mut ec_classes = input_eq.classes().to_vec(); - for (column, columns) in columns_map { - for class in ec_classes.iter_mut() { - if let Some(OrderedColumn { options, .. }) = class.get_matching_column(column) - { - for col in columns { - class.insert(OrderedColumn { - col: col.clone(), - options, - }); - } - break; - } - } + for class in ec_classes.iter_mut() { + class.update_with_aliases(columns_map); } prune_columns_to_remove(output_eq, &mut ec_classes); diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index a8a0625ca019b..831cd9299a5a6 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -15,10 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::equivalence::{ - EquivalenceProperties, EquivalentClass, OrderedColumn, OrderingEquivalenceProperties, - OrderingEquivalentClass, -}; +use crate::equivalence::{EquivalenceProperties, EquivalentClass, OrderedColumn, OrderingEquivalenceProperties, OrderingEquivalenceProperties2, OrderingEquivalentClass, OrderingEquivalentClass2}; use crate::expressions::{BinaryExpr, Column, UnKnownColumn}; use crate::{PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; @@ -35,6 +32,7 @@ use petgraph::stable_graph::StableGraph; use std::borrow::Borrow; use std::collections::HashMap; use std::collections::HashSet; +use std::ops::Range; use std::sync::Arc; /// Compare the two expr lists are equal no matter the order. @@ -265,6 +263,93 @@ pub fn normalize_sort_expr( ) } +fn get_ranges_inside(to_search: &[PhysicalSortExpr], section: &[PhysicalSortExpr]) -> Vec> { + let n_section = section.len(); + let n_end = if to_search.len() >= n_section { + to_search.len() - n_section + 1 + } else { + 0 + }; + let mut res = vec![]; + for idx in 0..n_end { + let end = idx + n_section; + if to_search[idx..end].eq(section){ + res.push(Range{start:idx, end}); + } + } + res +} + +fn get_ranges_inside2(to_search: &[T], section: &[T]) -> Vec> { + let n_section = section.len(); + let n_end = if to_search.len() >= n_section { + to_search.len() - n_section + 1 + } else { + 0 + }; + let mut res = vec![]; + for idx in 0..n_end { + let end = idx + n_section; + if to_search[idx..end].eq(section){ + res.push(Range{start:idx, end}); + } + } + res +} + +// fn get_range_inside2(to_search: &[T], section: &[T]) -> Option> { +// let n_section = section.len(); +// for idx in 0..to_search.len() - n_section + 1{ +// let end = idx + n_section; +// if to_search[idx..end].eq(section){ +// return Some(Range{start:idx, end}); +// } +// } +// None +// } + +fn collapse_vec(in_data: Vec) -> Vec{ + let mut out_data = vec![]; + for elem in in_data{ + if !out_data.contains(&elem){ + out_data.push(elem); + } + } + 
out_data +} + +pub fn normalize_sort_expr2( + sort_exprs: &[PhysicalSortExpr], + eq_properties: &[EquivalentClass], + ordering_eq_properties: &[OrderingEquivalentClass2], +) -> Vec { + let mut normalized_exprs = sort_exprs.iter().map(|sort_expr| normalize_sort_expr_with_equivalence_properties(sort_expr.clone(), eq_properties)).collect::>(); + for ordering_eq_class in ordering_eq_properties{ + for elem in ordering_eq_class.others() { + let elem: Vec = elem.clone().into_iter().map(|elem| elem.into()).collect::>(); + println!("normalized_exprs: {:?}", normalized_exprs); + println!("elem: {:?}", elem); + let ranges = get_ranges_inside(&normalized_exprs, &elem); + let mut offset: i64 = 0; + for Range{start, end} in ranges { + println!("start:{:?}, end:{:?}", start, end); + let head: Vec = ordering_eq_class.head().clone().into_iter().map(|elem| elem.into()).collect::>(); + println!("head:{:?}", head); + let updated_start: i64 = start as i64 + offset; + let updated_end: i64 = end as i64 + offset; + println!("updated_start: {:?}, updated_end:{:?}", updated_start, updated_end); + let range = end - start; + offset += head.len() as i64 - range as i64; + normalized_exprs.splice(updated_start as usize..updated_end as usize, head); + println!("normalized_exprs bef return:{:?}", normalized_exprs); + // break; + } + + } + } + collapse_vec(normalized_exprs) +} + pub fn normalize_sort_requirement( sort_requirement: PhysicalSortRequirement, eq_properties: &[EquivalentClass], @@ -302,6 +387,27 @@ pub fn ordering_satisfy< } } +/// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. 
+pub fn ordering_satisfy2< + F: FnOnce() -> EquivalenceProperties, + F2: FnOnce() -> OrderingEquivalenceProperties2, +>( + provided: Option<&[PhysicalSortExpr]>, + required: Option<&[PhysicalSortExpr]>, + equal_properties: F, + ordering_equal_properties: F2, +) -> bool { + match (provided, required) { + (_, None) => true, + (None, Some(_)) => false, + (Some(provided), Some(required)) => ordering_satisfy_concrete2( + provided, + required, + equal_properties, + ordering_equal_properties, + ), + } +} /// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_concrete< @@ -337,6 +443,36 @@ pub fn ordering_satisfy_concrete< .all(|(req, given)| given == req) } +/// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the +/// provided [`PhysicalSortExpr`]s. +pub fn ordering_satisfy_concrete2< + F: FnOnce() -> EquivalenceProperties, + F2: FnOnce() -> OrderingEquivalenceProperties2, +>( + provided: &[PhysicalSortExpr], + required: &[PhysicalSortExpr], + equal_properties: F, + ordering_equal_properties: F2, +) -> bool { + let oeq_properties = ordering_equal_properties(); + let ordering_eq_classes = oeq_properties.classes(); + let eq_properties = equal_properties(); + let eq_classes = eq_properties.classes(); + let mut required_normalized = normalize_sort_expr2(required, eq_classes, ordering_eq_classes); + // TODO: Add collapse procedure + let mut provided_normalized = normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); + // TODO: Add collapse procedure + println!("required_normalized: {:?}", required_normalized); + println!("provided_normalized: {:?}", provided_normalized); + if required_normalized.len() > provided_normalized.len() { + return false; + } + required_normalized + .into_iter() + .zip(provided_normalized) + .all(|(req, given)| given == req) +} + /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. 
pub fn ordering_satisfy_requirement< @@ -713,6 +849,7 @@ mod tests { use datafusion_common::{Result, ScalarValue}; use std::fmt::{Display, Formatter}; + use crate::equivalence::OrderingEquivalenceProperties2; use arrow_schema::{DataType, Field, Schema}; use petgraph::visit::Bfs; use std::sync::Arc; @@ -805,6 +942,49 @@ mod tests { Ok((test_schema, eq_properties, ordering_eq_properties)) } + fn create_test_params2() -> Result<( + SchemaRef, + EquivalenceProperties, + OrderingEquivalenceProperties2, + )> { + // Assume schema satisfies ordering a ASC NULLS LAST + // and d ASC NULLS LAST, b ASC NULLS LAST and e DESC NULLS FIRST, b ASC NULLS LAST + // Assume that column a and c are aliases. + let col_a = &Column::new("a", 0); + let col_b = &Column::new("b", 1); + let col_c = &Column::new("c", 2); + let col_d = &Column::new("d", 3); + let col_e = &Column::new("e", 4); + let option1 = SortOptions { + descending: false, + nulls_first: false, + }; + let option2 = SortOptions { + descending: true, + nulls_first: true, + }; + let test_schema = create_test_schema()?; + let mut eq_properties = EquivalenceProperties::new(test_schema.clone()); + eq_properties.add_equal_conditions((col_a, col_c)); + let mut ordering_eq_properties = + OrderingEquivalenceProperties2::new(test_schema.clone()); + ordering_eq_properties.add_equal_conditions(( + &vec![OrderedColumn::new(col_a.clone(), option1)], + &vec![ + OrderedColumn::new(col_d.clone(), option1), + OrderedColumn::new(col_b.clone(), option1), + ], + )); + ordering_eq_properties.add_equal_conditions(( + &vec![OrderedColumn::new(col_a.clone(), option1)], + &vec![ + OrderedColumn::new(col_e.clone(), option2), + OrderedColumn::new(col_b.clone(), option1), + ], + )); + Ok((test_schema, eq_properties, ordering_eq_properties)) + } + #[test] fn test_build_dag() -> Result<()> { let schema = Schema::new(vec![ @@ -1155,6 +1335,84 @@ mod tests { Ok(()) } + #[test] + fn test_ordering_satisfy_with_equivalence2() -> Result<()> { + let col_a = 
&Column::new("a", 0); + let col_b = &Column::new("b", 1); + let col_c = &Column::new("c", 2); + let col_d = &Column::new("d", 3); + let col_e = &Column::new("e", 4); + let option1 = SortOptions { + descending: false, + nulls_first: false, + }; + let option2 = SortOptions { + descending: true, + nulls_first: true, + }; + // The schema is ordered by a ASC NULLS LAST, b ASC NULLS LAST + let provided = vec![ + PhysicalSortExpr { + expr: Arc::new(col_a.clone()), + options: option1, + }, + PhysicalSortExpr { + expr: Arc::new(col_b.clone()), + options: option1, + }, + ]; + let provided = Some(&provided[..]); + let (_test_schema, eq_properties, ordering_eq_properties) = + create_test_params2()?; + // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function + let requirements = vec![ + // // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it + // (vec![(col_a, option1)], true), + // (vec![(col_a, option2)], false), + // // Test whether equivalence works as expected + // (vec![(col_c, option1)], true), + // (vec![(col_c, option2)], false), + // // Test whether ordering equivalence works as expected + // (vec![(col_d, option1)], false), + // (vec![(col_d, option1), (col_b, option1)], true), + // (vec![(col_d, option2), (col_b, option1)], false), + // (vec![(col_e, option2), (col_b, option1)], true), + // (vec![(col_e, option1), (col_b, option1)], false), + (vec![(col_d, option1), (col_b, option1), (col_d, option1), (col_b, option1)], true), + (vec![(col_d, option1), (col_b, option1), (col_e, option2), (col_b, option1)], true), + (vec![(col_d, option1), (col_b, option1), (col_d, option2), (col_b, option1)], false), + (vec![(col_d, option1), (col_b, option1), (col_e, option1), (col_b, option1)], false), + + // (vec![(col_d, option1)], true), + // (vec![(col_d, option2)], false), + // (vec![(col_e, option2)], true), + // 
(vec![(col_e, option1)], false), + ]; + for (cols, expected) in requirements { + let err_msg = format!("Error in test case:{cols:?}"); + let required = cols + .into_iter() + .map(|(col, options)| PhysicalSortExpr { + expr: Arc::new(col.clone()), + options, + }) + .collect::>(); + + let required = Some(&required[..]); + assert_eq!( + ordering_satisfy2( + provided, + required, + || eq_properties.clone(), + || ordering_eq_properties.clone(), + ), + expected, + "{err_msg}" + ); + } + Ok(()) + } + #[test] fn test_reassign_predicate_columns_in_list() { let int_field = Field::new("should_not_matter", DataType::Int64, true); @@ -1491,4 +1749,23 @@ mod tests { Ok(()) } + + #[test] + fn test_get_range_inside() -> Result<()> { + let empty_vec: Vec> = Vec::new(); + assert_eq!(get_ranges_inside2(&[1,2,3], &[1,2]), vec![Range{start: 0, end: 2}]); + assert_eq!(get_ranges_inside2(&[1,2,3], &[2,3]), vec![Range{start: 1, end: 3}]); + assert_eq!(get_ranges_inside2(&[1,2,3], &[1,3]), empty_vec); + assert_eq!(get_ranges_inside2(&[1,2,3], &[1, 2,3]), vec![Range{start: 0, end: 3}]); + assert_eq!(get_ranges_inside2(&[1,2,3], &[3, 2]), empty_vec); + Ok(()) + } + + #[test] + fn test_collapse_vec() -> Result<()> { + assert_eq!(collapse_vec(vec![1, 2, 3]), vec![1, 2, 3]); + assert_eq!(collapse_vec(vec![1, 2, 3, 2, 3]), vec![1, 2, 3]); + assert_eq!(collapse_vec(vec![3, 1, 2, 3, 2, 3]), vec![3, 1, 2]); + Ok(()) + } } From 8244cb72ea0ee52c1afe3da256f617fcca986159 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 14:53:49 +0300 Subject: [PATCH 02/23] compiles --- .../core/src/physical_plan/aggregates/mod.rs | 10 +- datafusion/core/src/physical_plan/mod.rs | 8 +- .../core/src/physical_plan/projection.rs | 9 +- .../windows/bounded_window_agg_exec.rs | 3 +- .../core/src/physical_plan/windows/mod.rs | 54 +- .../physical_plan/windows/window_agg_exec.rs | 3 +- datafusion/physical-expr/src/equivalence.rs | 62 ++- datafusion/physical-expr/src/utils.rs | 475 +++++++++++++----- 8 files 
changed, 457 insertions(+), 167 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index 247dfc27784ea..4a66d06ce180c 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -55,6 +55,7 @@ mod row_hash; mod utils; pub use datafusion_expr::AggregateFunction; +use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; pub use datafusion_physical_expr::expressions::create_aggregate_expr; /// Hash aggregate modes @@ -346,7 +347,7 @@ fn output_group_expr_helper(group_by: &PhysicalGroupBy) -> Vec EquivalenceProperties, - F2: Fn() -> OrderingEquivalenceProperties, + F2: Fn() -> OrderingEquivalenceProperties2, >( order_by_expr: &[Option>], eq_properties: F, @@ -1074,6 +1075,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::{DataFusionError, Result, ScalarValue}; + use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::expressions::{ lit, ApproxDistinct, Column, Count, Median, }; @@ -1734,10 +1736,10 @@ mod tests { let col_c = Column::new("c", 2); let col_d = Column::new("d", 3); eq_properties.add_equal_conditions((&col_a, &col_b)); - let mut ordering_eq_properties = OrderingEquivalenceProperties::new(test_schema); + let mut ordering_eq_properties = OrderingEquivalenceProperties2::new(test_schema); ordering_eq_properties.add_equal_conditions(( - &OrderedColumn::new(col_a.clone(), options1), - &OrderedColumn::new(col_c.clone(), options2), + &vec![OrderedColumn::new(col_a.clone(), options1)], + &vec![OrderedColumn::new(col_c.clone(), options2)], )); let order_by_exprs = vec![ diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs index c8edf701cf05c..dac8774d7e786 100644 --- a/datafusion/core/src/physical_plan/mod.rs +++ 
b/datafusion/core/src/physical_plan/mod.rs @@ -32,7 +32,9 @@ use arrow::record_batch::RecordBatch; pub use datafusion_expr::Accumulator; pub use datafusion_expr::ColumnarValue; pub use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; -use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties; +use datafusion_physical_expr::equivalence::{ + OrderingEquivalenceProperties, OrderingEquivalenceProperties2, +}; pub use display::DisplayFormatType; use futures::stream::{Stream, TryStreamExt}; use std::fmt; @@ -189,8 +191,8 @@ pub trait ExecutionPlan: Debug + Send + Sync { } /// Get the OrderingEquivalenceProperties within the plan - fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { - OrderingEquivalenceProperties::new(self.schema()) + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties2 { + OrderingEquivalenceProperties2::new(self.schema()) } /// Get a list of child execution plans that provide the input for this plan. 
The returned list diff --git a/datafusion/core/src/physical_plan/projection.rs b/datafusion/core/src/physical_plan/projection.rs index f2775079fc4a4..17ca50ce1c9ba 100644 --- a/datafusion/core/src/physical_plan/projection.rs +++ b/datafusion/core/src/physical_plan/projection.rs @@ -41,6 +41,9 @@ use super::expressions::{Column, PhysicalSortExpr}; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use super::{RecordBatchStream, SendableRecordBatchStream, Statistics}; +use datafusion_physical_expr::equivalence::{ + project_ordering_equivalence_properties2, OrderingEquivalenceProperties2, +}; use datafusion_physical_expr::{ normalize_out_expr_with_columns_map, project_equivalence_properties, project_ordering_equivalence_properties, OrderingEquivalenceProperties, @@ -216,9 +219,9 @@ impl ExecutionPlan for ProjectionExec { new_properties } - fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { - let mut new_properties = OrderingEquivalenceProperties::new(self.schema()); - project_ordering_equivalence_properties( + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties2 { + let mut new_properties = OrderingEquivalenceProperties2::new(self.schema()); + project_ordering_equivalence_properties2( self.input.ordering_equivalence_properties(), &self.columns_map, &mut new_properties, diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index 95b482ef24963..f63aae5d1c9fa 100644 --- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -60,6 +60,7 @@ use datafusion_common::utils::{ }; use datafusion_common::DataFusionError; use datafusion_expr::ColumnarValue; +use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::hash_utils::create_hashes; use 
datafusion_physical_expr::window::{ PartitionBatchState, PartitionBatches, PartitionKey, PartitionWindowAggStates, @@ -261,7 +262,7 @@ impl ExecutionPlan for BoundedWindowAggExec { } /// Get the OrderingEquivalenceProperties within the plan - fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties2 { window_ordering_equivalence(&self.schema, &self.input, &self.window_expr) } diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index f6fe3bcaee9e7..01d6c52e238f9 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -47,6 +47,7 @@ mod window_agg_exec; pub use bounded_window_agg_exec::BoundedWindowAggExec; pub use bounded_window_agg_exec::PartitionSearchMode; use datafusion_common::utils::longest_consecutive_prefix; +use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::{convert_to_expr, get_indices_of_matching_exprs}; pub use datafusion_physical_expr::window::{ @@ -251,10 +252,10 @@ pub(crate) fn window_ordering_equivalence( schema: &SchemaRef, input: &Arc, window_expr: &[Arc], -) -> OrderingEquivalenceProperties { +) -> OrderingEquivalenceProperties2 { // We need to update the schema, so we can not directly use // `input.ordering_equivalence_properties()`. 
- let mut result = OrderingEquivalenceProperties::new(schema.clone()); + let mut result = OrderingEquivalenceProperties2::new(schema.clone()); result.extend( input .ordering_equivalence_properties() @@ -263,6 +264,21 @@ pub(crate) fn window_ordering_equivalence( .cloned(), ); let out_ordering = input.output_ordering().unwrap_or(&[]); + let mut out_ordering_normalized = vec![]; + for elem in out_ordering { + // Normalize expression, as we search for ordering equivalences + // on normalized versions: + let normalized = normalize_expr_with_equivalence_properties( + elem.expr.clone(), + input.equivalence_properties().classes(), + ); + if let Some(column) = normalized.as_any().downcast_ref::() { + out_ordering_normalized + .push(OrderedColumn::new(column.clone(), elem.options)); + } else { + break; + } + } for expr in window_expr { if let Some(builtin_window_expr) = expr.as_any().downcast_ref::() @@ -275,28 +291,18 @@ pub(crate) fn window_ordering_equivalence( .is::() { // If there is an existing ordering, add new ordering as an equivalence: - if let Some(first) = out_ordering.first() { - // Normalize expression, as we search for ordering equivalences - // on normalized versions: - let normalized = normalize_expr_with_equivalence_properties( - first.expr.clone(), - input.equivalence_properties().classes(), - ); - if let Some(column) = normalized.as_any().downcast_ref::() { - let column_info = - schema.column_with_name(expr.field().unwrap().name()); - if let Some((idx, field)) = column_info { - let lhs = OrderedColumn::new(column.clone(), first.options); - let options = SortOptions { - descending: false, - nulls_first: false, - }; // ASC, NULLS LAST - let rhs = OrderedColumn::new( - Column::new(field.name(), idx), - options, - ); - result.add_equal_conditions((&lhs, &rhs)); - } + if !out_ordering_normalized.is_empty() { + let options = SortOptions { + descending: false, + nulls_first: false, + }; // ASC, NULLS LAST + let column_info = + 
schema.column_with_name(expr.field().unwrap().name()); + if let Some((idx, field)) = column_info { + let rhs = + OrderedColumn::new(Column::new(field.name(), idx), options); + result + .add_equal_conditions((&out_ordering_normalized, &vec![rhs])); } } } diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index dc0302d77b983..f3f416bd7ae0c 100644 --- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -42,6 +42,7 @@ use arrow::{ }; use datafusion_common::utils::{evaluate_partition_ranges, get_at_indices}; use datafusion_common::DataFusionError; +use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::{OrderingEquivalenceProperties, PhysicalSortRequirement}; use futures::stream::Stream; use futures::{ready, StreamExt}; @@ -192,7 +193,7 @@ impl ExecutionPlan for WindowAggExec { } /// Get the OrderingEquivalenceProperties within the plan - fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties2 { window_ordering_equivalence(&self.schema, &self.input, &self.window_expr) } diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 0c611bd50cc0a..4dd04a9160236 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -20,10 +20,10 @@ use crate::expressions::Column; use arrow::datatypes::SchemaRef; use arrow_schema::SortOptions; +use crate::{PhysicalSortExpr, PhysicalSortRequirement}; use std::collections::{HashMap, HashSet}; use std::hash::Hash; use std::sync::Arc; -use crate::PhysicalSortExpr; /// Equivalence Properties is a vec of EquivalentClass. 
#[derive(Debug, Clone)] @@ -262,15 +262,23 @@ impl OrderedColumn { } } -impl Into for OrderedColumn{ +impl Into for OrderedColumn { fn into(self) -> PhysicalSortExpr { - PhysicalSortExpr{ + PhysicalSortExpr { expr: Arc::new(self.col) as _, - options: self.options + options: self.options, } } } +impl Into for OrderedColumn { + fn into(self) -> PhysicalSortRequirement { + PhysicalSortRequirement { + expr: Arc::new(self.col) as _, + options: Some(self.options), + } + } +} trait ColumnAccessor { fn column(&self) -> &Column; @@ -331,6 +339,30 @@ impl OrderingEquivalentClass { } } +impl OrderingEquivalentClass2 { + fn update_with_aliases(&mut self, columns_map: &HashMap>) { + for (column, columns) in columns_map { + for ordering in vec![self.head.clone()] + .iter() + .chain(self.others.clone().iter()) + { + for (idx, elem) in ordering.iter().enumerate() { + if elem.col.eq(column) { + let mut normalized = self.head.clone(); + for col in columns { + normalized[idx] = OrderedColumn { + col: col.clone(), + options: self.head[idx].options, + }; + self.insert(normalized.clone()); + } + } + } + } + } + } +} + /// This function applies the given projection to the given equivalence /// properties to compute the resulting (projected) equivalence properties; e.g. /// 1) Adding an alias, which can introduce additional equivalence properties, @@ -383,6 +415,28 @@ pub fn project_ordering_equivalence_properties( output_eq.extend(ec_classes); } +/// This function applies the given projection to the given ordering +/// equivalence properties to compute the resulting (projected) ordering +/// equivalence properties; e.g. +/// 1) Adding an alias, which can introduce additional ordering equivalence +/// properties, as in Projection(a, a as a1, a as a2) extends global ordering +/// of a to a1 and a2. +/// 2) Truncate the [`OrderingEquivalentClass`]es that are not in the output schema. 
+pub fn project_ordering_equivalence_properties2( + input_eq: OrderingEquivalenceProperties2, + columns_map: &HashMap>, + output_eq: &mut OrderingEquivalenceProperties2, +) { + let mut ec_classes = input_eq.classes().to_vec(); + for class in ec_classes.iter_mut() { + class.update_with_aliases(columns_map); + } + + // prune_columns_to_remove(output_eq, &mut ec_classes); + // TODO: Add pruning + output_eq.extend(ec_classes); +} + fn prune_columns_to_remove( eq_properties: &EquivalenceProperties, eq_classes: &mut Vec>, diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 831cd9299a5a6..5de7bf57c5d8b 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::equivalence::{EquivalenceProperties, EquivalentClass, OrderedColumn, OrderingEquivalenceProperties, OrderingEquivalenceProperties2, OrderingEquivalentClass, OrderingEquivalentClass2}; +use crate::equivalence::{ + EquivalenceProperties, EquivalentClass, OrderedColumn, OrderingEquivalenceProperties, + OrderingEquivalenceProperties2, OrderingEquivalentClass, OrderingEquivalentClass2, +}; use crate::expressions::{BinaryExpr, Column, UnKnownColumn}; use crate::{PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; @@ -263,22 +266,22 @@ pub fn normalize_sort_expr( ) } -fn get_ranges_inside(to_search: &[PhysicalSortExpr], section: &[PhysicalSortExpr]) -> Vec> { - let n_section = section.len(); - let n_end = if to_search.len() >= n_section { - to_search.len() - n_section + 1 - } else { - 0 - }; - let mut res = vec![]; - for idx in 0..n_end { - let end = idx + n_section; - if to_search[idx..end].eq(section){ - res.push(Range{start:idx, end}); - } - } - res -} +// fn get_ranges_inside(to_search: &[PhysicalSortExpr], section: &[PhysicalSortExpr]) -> Vec> { +// let n_section = section.len(); +// let n_end = if to_search.len() >= 
n_section { +// to_search.len() - n_section + 1 +// } else { +// 0 +// }; +// let mut res = vec![]; +// for idx in 0..n_end { +// let end = idx + n_section; +// if to_search[idx..end].eq(section){ +// res.push(Range{start:idx, end}); +// } +// } +// res +// } fn get_ranges_inside2(to_search: &[T], section: &[T]) -> Vec> { let n_section = section.len(); @@ -290,8 +293,8 @@ fn get_ranges_inside2(to_search: &[T], section: &[T]) -> Vec(to_search: &[T], section: &[T]) -> Vec(in_data: Vec) -> Vec{ +fn collapse_vec(in_data: Vec) -> Vec { let mut out_data = vec![]; - for elem in in_data{ - if !out_data.contains(&elem){ + for elem in in_data { + if !out_data.contains(&elem) { out_data.push(elem); } } @@ -323,28 +326,100 @@ pub fn normalize_sort_expr2( eq_properties: &[EquivalentClass], ordering_eq_properties: &[OrderingEquivalentClass2], ) -> Vec { - let mut normalized_exprs = sort_exprs.iter().map(|sort_expr| normalize_sort_expr_with_equivalence_properties(sort_expr.clone(), eq_properties)).collect::>(); - for ordering_eq_class in ordering_eq_properties{ + let mut normalized_exprs = sort_exprs + .iter() + .map(|sort_expr| { + normalize_sort_expr_with_equivalence_properties( + sort_expr.clone(), + eq_properties, + ) + }) + .collect::>(); + for ordering_eq_class in ordering_eq_properties { for elem in ordering_eq_class.others() { - let elem: Vec = elem.clone().into_iter().map(|elem| elem.into()).collect::>(); + let elem: Vec = elem + .clone() + .into_iter() + .map(|elem| elem.into()) + .collect::>(); println!("normalized_exprs: {:?}", normalized_exprs); println!("elem: {:?}", elem); - let ranges = get_ranges_inside(&normalized_exprs, &elem); + let ranges = get_ranges_inside2(&normalized_exprs, &elem); let mut offset: i64 = 0; - for Range{start, end} in ranges { + for Range { start, end } in ranges { println!("start:{:?}, end:{:?}", start, end); - let head: Vec = ordering_eq_class.head().clone().into_iter().map(|elem| elem.into()).collect::>(); + let head: Vec = 
ordering_eq_class + .head() + .clone() + .into_iter() + .map(|elem| elem.into()) + .collect::>(); println!("head:{:?}", head); let updated_start: i64 = start as i64 + offset; let updated_end: i64 = end as i64 + offset; - println!("updated_start: {:?}, updated_end:{:?}", updated_start, updated_end); + println!( + "updated_start: {:?}, updated_end:{:?}", + updated_start, updated_end + ); let range = end - start; offset += head.len() as i64 - range as i64; - normalized_exprs.splice(updated_start as usize..updated_end as usize, head); + normalized_exprs + .splice(updated_start as usize..updated_end as usize, head); println!("normalized_exprs bef return:{:?}", normalized_exprs); // break; } + } + } + collapse_vec(normalized_exprs) +} +pub fn normalize_sort_requirements2( + sort_exprs: &[PhysicalSortRequirement], + eq_properties: &[EquivalentClass], + ordering_eq_properties: &[OrderingEquivalentClass2], +) -> Vec { + let mut normalized_exprs = sort_exprs + .iter() + .map(|sort_expr| { + normalize_sort_requirement_with_equivalence_properties( + sort_expr.clone(), + eq_properties, + ) + }) + .collect::>(); + for ordering_eq_class in ordering_eq_properties { + for elem in ordering_eq_class.others() { + let elem: Vec = elem + .clone() + .into_iter() + .map(|elem| elem.into()) + .collect::>(); + println!("normalized_exprs: {:?}", normalized_exprs); + println!("elem: {:?}", elem); + let ranges = get_ranges_inside2(&normalized_exprs, &elem); + let mut offset: i64 = 0; + for Range { start, end } in ranges { + println!("start:{:?}, end:{:?}", start, end); + let head: Vec = ordering_eq_class + .head() + .clone() + .into_iter() + .map(|elem| elem.into()) + .collect::>(); + println!("head:{:?}", head); + let updated_start: i64 = start as i64 + offset; + let updated_end: i64 = end as i64 + offset; + println!( + "updated_start: {:?}, updated_end:{:?}", + updated_start, updated_end + ); + let range = end - start; + offset += head.len() as i64 - range as i64; + normalized_exprs + 
.splice(updated_start as usize..updated_end as usize, head); + println!("normalized_exprs bef return:{:?}", normalized_exprs); + // break; + } } } collapse_vec(normalized_exprs) @@ -365,10 +440,32 @@ pub fn normalize_sort_requirement( ) } +// /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. +// pub fn ordering_satisfy< +// F: FnOnce() -> EquivalenceProperties, +// F2: FnOnce() -> OrderingEquivalenceProperties, +// >( +// provided: Option<&[PhysicalSortExpr]>, +// required: Option<&[PhysicalSortExpr]>, +// equal_properties: F, +// ordering_equal_properties: F2, +// ) -> bool { +// match (provided, required) { +// (_, None) => true, +// (None, Some(_)) => false, +// (Some(provided), Some(required)) => ordering_satisfy_concrete( +// provided, +// required, +// equal_properties, +// ordering_equal_properties, +// ), +// } +// } + /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. pub fn ordering_satisfy< F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties, + F2: FnOnce() -> OrderingEquivalenceProperties2, >( provided: Option<&[PhysicalSortExpr]>, required: Option<&[PhysicalSortExpr]>, @@ -387,65 +484,44 @@ pub fn ordering_satisfy< } } -/// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. -pub fn ordering_satisfy2< - F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties2, ->( - provided: Option<&[PhysicalSortExpr]>, - required: Option<&[PhysicalSortExpr]>, - equal_properties: F, - ordering_equal_properties: F2, -) -> bool { - match (provided, required) { - (_, None) => true, - (None, Some(_)) => false, - (Some(provided), Some(required)) => ordering_satisfy_concrete2( - provided, - required, - equal_properties, - ordering_equal_properties, - ), - } -} -/// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the -/// provided [`PhysicalSortExpr`]s. 
-pub fn ordering_satisfy_concrete< - F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties, ->( - provided: &[PhysicalSortExpr], - required: &[PhysicalSortExpr], - equal_properties: F, - ordering_equal_properties: F2, -) -> bool { - let oeq_properties = ordering_equal_properties(); - let ordering_eq_classes = oeq_properties.classes(); - let eq_properties = equal_properties(); - let eq_classes = eq_properties.classes(); - let mut required_normalized = Vec::new(); - for expr in required { - let item = normalize_sort_expr(expr.clone(), eq_classes, ordering_eq_classes); - if !required_normalized.contains(&item) { - required_normalized.push(item); - } - } - let provided_normalized = provided - .iter() - .map(|e| normalize_sort_expr(e.clone(), eq_classes, ordering_eq_classes)) - .collect::>(); - if required_normalized.len() > provided_normalized.len() { - return false; - } - required_normalized - .into_iter() - .zip(provided_normalized) - .all(|(req, given)| given == req) -} +// /// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the +// /// provided [`PhysicalSortExpr`]s. 
+// pub fn ordering_satisfy_concrete< +// F: FnOnce() -> EquivalenceProperties, +// F2: FnOnce() -> OrderingEquivalenceProperties, +// >( +// provided: &[PhysicalSortExpr], +// required: &[PhysicalSortExpr], +// equal_properties: F, +// ordering_equal_properties: F2, +// ) -> bool { +// let oeq_properties = ordering_equal_properties(); +// let ordering_eq_classes = oeq_properties.classes(); +// let eq_properties = equal_properties(); +// let eq_classes = eq_properties.classes(); +// let mut required_normalized = Vec::new(); +// for expr in required { +// let item = normalize_sort_expr(expr.clone(), eq_classes, ordering_eq_classes); +// if !required_normalized.contains(&item) { +// required_normalized.push(item); +// } +// } +// let provided_normalized = provided +// .iter() +// .map(|e| normalize_sort_expr(e.clone(), eq_classes, ordering_eq_classes)) +// .collect::>(); +// if required_normalized.len() > provided_normalized.len() { +// return false; +// } +// required_normalized +// .into_iter() +// .zip(provided_normalized) +// .all(|(req, given)| given == req) +// } /// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. 
-pub fn ordering_satisfy_concrete2< +pub fn ordering_satisfy_concrete< F: FnOnce() -> EquivalenceProperties, F2: FnOnce() -> OrderingEquivalenceProperties2, >( @@ -458,9 +534,11 @@ pub fn ordering_satisfy_concrete2< let ordering_eq_classes = oeq_properties.classes(); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); - let mut required_normalized = normalize_sort_expr2(required, eq_classes, ordering_eq_classes); + let mut required_normalized = + normalize_sort_expr2(required, eq_classes, ordering_eq_classes); // TODO: Add collapse procedure - let mut provided_normalized = normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); + let mut provided_normalized = + normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); // TODO: Add collapse procedure println!("required_normalized: {:?}", required_normalized); println!("provided_normalized: {:?}", provided_normalized); @@ -473,11 +551,34 @@ pub fn ordering_satisfy_concrete2< .all(|(req, given)| given == req) } +// /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the +// /// provided [`PhysicalSortExpr`]s. +// pub fn ordering_satisfy_requirement< +// F: FnOnce() -> EquivalenceProperties, +// F2: FnOnce() -> OrderingEquivalenceProperties, +// >( +// provided: Option<&[PhysicalSortExpr]>, +// required: Option<&[PhysicalSortRequirement]>, +// equal_properties: F, +// ordering_equal_properties: F2, +// ) -> bool { +// match (provided, required) { +// (_, None) => true, +// (None, Some(_)) => false, +// (Some(provided), Some(required)) => ordering_satisfy_requirement_concrete( +// provided, +// required, +// equal_properties, +// ordering_equal_properties, +// ), +// } +// } + /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. 
pub fn ordering_satisfy_requirement< F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties, + F2: FnOnce() -> OrderingEquivalenceProperties2, >( provided: Option<&[PhysicalSortExpr]>, required: Option<&[PhysicalSortRequirement]>, @@ -496,11 +597,47 @@ pub fn ordering_satisfy_requirement< } } +// /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the +// /// provided [`PhysicalSortExpr`]s. +// pub fn ordering_satisfy_requirement_concrete< +// F: FnOnce() -> EquivalenceProperties, +// F2: FnOnce() -> OrderingEquivalenceProperties, +// >( +// provided: &[PhysicalSortExpr], +// required: &[PhysicalSortRequirement], +// equal_properties: F, +// ordering_equal_properties: F2, +// ) -> bool { +// let oeq_properties = ordering_equal_properties(); +// let ordering_eq_classes = oeq_properties.classes(); +// let eq_properties = equal_properties(); +// let eq_classes = eq_properties.classes(); +// let mut required_normalized = Vec::new(); +// for req in required { +// let item = +// normalize_sort_requirement(req.clone(), eq_classes, ordering_eq_classes); +// if !required_normalized.contains(&item) { +// required_normalized.push(item); +// } +// } +// let provided_normalized = provided +// .iter() +// .map(|e| normalize_sort_expr(e.clone(), eq_classes, ordering_eq_classes)) +// .collect::>(); +// if required_normalized.len() > provided_normalized.len() { +// return false; +// } +// required_normalized +// .into_iter() +// .zip(provided_normalized) +// .all(|(req, given)| given.satisfy(&req)) +// } + /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. 
pub fn ordering_satisfy_requirement_concrete< F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties, + F2: FnOnce() -> OrderingEquivalenceProperties2, >( provided: &[PhysicalSortExpr], required: &[PhysicalSortRequirement], @@ -511,18 +648,10 @@ pub fn ordering_satisfy_requirement_concrete< let ordering_eq_classes = oeq_properties.classes(); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); - let mut required_normalized = Vec::new(); - for req in required { - let item = - normalize_sort_requirement(req.clone(), eq_classes, ordering_eq_classes); - if !required_normalized.contains(&item) { - required_normalized.push(item); - } - } - let provided_normalized = provided - .iter() - .map(|e| normalize_sort_expr(e.clone(), eq_classes, ordering_eq_classes)) - .collect::>(); + let mut required_normalized = + normalize_sort_requirements2(required, eq_classes, ordering_eq_classes); + let mut provided_normalized = + normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -532,10 +661,33 @@ pub fn ordering_satisfy_requirement_concrete< .all(|(req, given)| given.satisfy(&req)) } +// /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more +// /// specific than the provided [`PhysicalSortRequirement`]s. 
+// pub fn requirements_compatible< +// F: FnOnce() -> OrderingEquivalenceProperties, +// F2: FnOnce() -> EquivalenceProperties, +// >( +// provided: Option<&[PhysicalSortRequirement]>, +// required: Option<&[PhysicalSortRequirement]>, +// ordering_equal_properties: F, +// equal_properties: F2, +// ) -> bool { +// match (provided, required) { +// (_, None) => true, +// (None, Some(_)) => false, +// (Some(provided), Some(required)) => requirements_compatible_concrete( +// provided, +// required, +// ordering_equal_properties, +// equal_properties, +// ), +// } +// } + /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more /// specific than the provided [`PhysicalSortRequirement`]s. pub fn requirements_compatible< - F: FnOnce() -> OrderingEquivalenceProperties, + F: FnOnce() -> OrderingEquivalenceProperties2, F2: FnOnce() -> EquivalenceProperties, >( provided: Option<&[PhysicalSortRequirement]>, @@ -555,10 +707,46 @@ pub fn requirements_compatible< } } +// /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more +// /// specific than the provided [`PhysicalSortRequirement`]s. 
+// fn requirements_compatible_concrete< +// F: FnOnce() -> OrderingEquivalenceProperties, +// F2: FnOnce() -> EquivalenceProperties, +// >( +// provided: &[PhysicalSortRequirement], +// required: &[PhysicalSortRequirement], +// ordering_equal_properties: F, +// equal_properties: F2, +// ) -> bool { +// let oeq_properties = ordering_equal_properties(); +// let ordering_eq_classes = oeq_properties.classes(); +// let eq_properties = equal_properties(); +// let eq_classes = eq_properties.classes(); +// let mut required_normalized = Vec::new(); +// for req in required { +// let item = +// normalize_sort_requirement(req.clone(), eq_classes, ordering_eq_classes); +// if !required_normalized.contains(&item) { +// required_normalized.push(item); +// } +// } +// let provided_normalized = provided +// .iter() +// .map(|e| normalize_sort_requirement(e.clone(), eq_classes, ordering_eq_classes)) +// .collect::>(); +// if required_normalized.len() > provided_normalized.len() { +// return false; +// } +// required_normalized +// .into_iter() +// .zip(provided_normalized) +// .all(|(req, given)| given.compatible(&req)) +// } + /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more /// specific than the provided [`PhysicalSortRequirement`]s. 
fn requirements_compatible_concrete< - F: FnOnce() -> OrderingEquivalenceProperties, + F: FnOnce() -> OrderingEquivalenceProperties2, F2: FnOnce() -> EquivalenceProperties, >( provided: &[PhysicalSortRequirement], @@ -570,18 +758,11 @@ fn requirements_compatible_concrete< let ordering_eq_classes = oeq_properties.classes(); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); - let mut required_normalized = Vec::new(); - for req in required { - let item = - normalize_sort_requirement(req.clone(), eq_classes, ordering_eq_classes); - if !required_normalized.contains(&item) { - required_normalized.push(item); - } - } - let provided_normalized = provided - .iter() - .map(|e| normalize_sort_requirement(e.clone(), eq_classes, ordering_eq_classes)) - .collect::>(); + + let mut required_normalized = + normalize_sort_requirements2(required, eq_classes, ordering_eq_classes); + let mut provided_normalized = + normalize_sort_requirements2(provided, eq_classes, ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -1378,11 +1559,42 @@ mod tests { // (vec![(col_d, option2), (col_b, option1)], false), // (vec![(col_e, option2), (col_b, option1)], true), // (vec![(col_e, option1), (col_b, option1)], false), - (vec![(col_d, option1), (col_b, option1), (col_d, option1), (col_b, option1)], true), - (vec![(col_d, option1), (col_b, option1), (col_e, option2), (col_b, option1)], true), - (vec![(col_d, option1), (col_b, option1), (col_d, option2), (col_b, option1)], false), - (vec![(col_d, option1), (col_b, option1), (col_e, option1), (col_b, option1)], false), - + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_d, option1), + (col_b, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_e, option2), + (col_b, option1), + ], + true, + ), + ( + vec![ + (col_d, option1), + (col_b, option1), + (col_d, option2), + (col_b, option1), + ], + false, + ), + ( + vec![ + (col_d, 
option1), + (col_b, option1), + (col_e, option1), + (col_b, option1), + ], + false, + ), // (vec![(col_d, option1)], true), // (vec![(col_d, option2)], false), // (vec![(col_e, option2)], true), @@ -1753,11 +1965,20 @@ mod tests { #[test] fn test_get_range_inside() -> Result<()> { let empty_vec: Vec> = Vec::new(); - assert_eq!(get_ranges_inside2(&[1,2,3], &[1,2]), vec![Range{start: 0, end: 2}]); - assert_eq!(get_ranges_inside2(&[1,2,3], &[2,3]), vec![Range{start: 1, end: 3}]); - assert_eq!(get_ranges_inside2(&[1,2,3], &[1,3]), empty_vec); - assert_eq!(get_ranges_inside2(&[1,2,3], &[1, 2,3]), vec![Range{start: 0, end: 3}]); - assert_eq!(get_ranges_inside2(&[1,2,3], &[3, 2]), empty_vec); + assert_eq!( + get_ranges_inside2(&[1, 2, 3], &[1, 2]), + vec![Range { start: 0, end: 2 }] + ); + assert_eq!( + get_ranges_inside2(&[1, 2, 3], &[2, 3]), + vec![Range { start: 1, end: 3 }] + ); + assert_eq!(get_ranges_inside2(&[1, 2, 3], &[1, 3]), empty_vec); + assert_eq!( + get_ranges_inside2(&[1, 2, 3], &[1, 2, 3]), + vec![Range { start: 0, end: 3 }] + ); + assert_eq!(get_ranges_inside2(&[1, 2, 3], &[3, 2]), empty_vec); Ok(()) } From 4689c7bbc83faef2b8a14fac80f3c462a310cd7b Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 14:59:36 +0300 Subject: [PATCH 03/23] Simplifications --- datafusion/physical-expr/src/utils.rs | 93 ++++----------------------- 1 file changed, 13 insertions(+), 80 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 5de7bf57c5d8b..af35502ce6e74 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -1438,13 +1438,13 @@ mod tests { finer, crude, || { EquivalenceProperties::new(empty_schema.clone()) }, - || { OrderingEquivalenceProperties::new(empty_schema.clone()) }, + || { OrderingEquivalenceProperties2::new(empty_schema.clone()) }, )); assert!(!ordering_satisfy( crude, finer, || { EquivalenceProperties::new(empty_schema.clone()) }, - || { 
OrderingEquivalenceProperties::new(empty_schema.clone()) }, + || { OrderingEquivalenceProperties2::new(empty_schema.clone()) }, )); Ok(()) } @@ -1476,7 +1476,8 @@ mod tests { }, ]; let provided = Some(&provided[..]); - let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; + let (_test_schema, eq_properties, ordering_eq_properties) = + create_test_params2()?; // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function let requirements = vec![ // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it @@ -1486,79 +1487,11 @@ mod tests { (vec![(col_c, option1)], true), (vec![(col_c, option2)], false), // Test whether ordering equivalence works as expected - (vec![(col_d, option1)], true), - (vec![(col_d, option2)], false), - (vec![(col_e, option2)], true), - (vec![(col_e, option1)], false), - ]; - for (cols, expected) in requirements { - let err_msg = format!("Error in test case:{cols:?}"); - let required = cols - .into_iter() - .map(|(col, options)| PhysicalSortExpr { - expr: Arc::new(col.clone()), - options, - }) - .collect::>(); - - let required = Some(&required[..]); - assert_eq!( - ordering_satisfy( - provided, - required, - || eq_properties.clone(), - || ordering_eq_properties.clone(), - ), - expected, - "{err_msg}" - ); - } - Ok(()) - } - - #[test] - fn test_ordering_satisfy_with_equivalence2() -> Result<()> { - let col_a = &Column::new("a", 0); - let col_b = &Column::new("b", 1); - let col_c = &Column::new("c", 2); - let col_d = &Column::new("d", 3); - let col_e = &Column::new("e", 4); - let option1 = SortOptions { - descending: false, - nulls_first: false, - }; - let option2 = SortOptions { - descending: true, - nulls_first: true, - }; - // The schema is ordered by a ASC NULLS LAST, b ASC NULLS LAST - let provided = vec![ - PhysicalSortExpr { - expr: Arc::new(col_a.clone()), - 
options: option1, - }, - PhysicalSortExpr { - expr: Arc::new(col_b.clone()), - options: option1, - }, - ]; - let provided = Some(&provided[..]); - let (_test_schema, eq_properties, ordering_eq_properties) = - create_test_params2()?; - // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function - let requirements = vec![ - // // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it - // (vec![(col_a, option1)], true), - // (vec![(col_a, option2)], false), - // // Test whether equivalence works as expected - // (vec![(col_c, option1)], true), - // (vec![(col_c, option2)], false), - // // Test whether ordering equivalence works as expected - // (vec![(col_d, option1)], false), - // (vec![(col_d, option1), (col_b, option1)], true), - // (vec![(col_d, option2), (col_b, option1)], false), - // (vec![(col_e, option2), (col_b, option1)], true), - // (vec![(col_e, option1), (col_b, option1)], false), + (vec![(col_d, option1)], false), + (vec![(col_d, option1), (col_b, option1)], true), + (vec![(col_d, option2), (col_b, option1)], false), + (vec![(col_e, option2), (col_b, option1)], true), + (vec![(col_e, option1), (col_b, option1)], false), ( vec![ (col_d, option1), @@ -1612,7 +1545,7 @@ mod tests { let required = Some(&required[..]); assert_eq!( - ordering_satisfy2( + ordering_satisfy( provided, required, || eq_properties.clone(), @@ -1894,10 +1827,10 @@ mod tests { eq_properties.add_equal_conditions((col_a, col_c)); // Column a and e are ordering equivalent (e.g global ordering of the table can be described both as a ASC and e ASC.) 
- let mut ordering_eq_properties = OrderingEquivalenceProperties::new(test_schema); + let mut ordering_eq_properties = OrderingEquivalenceProperties2::new(test_schema); ordering_eq_properties.add_equal_conditions(( - &OrderedColumn::new(col_a.clone(), option1), - &OrderedColumn::new(col_e.clone(), option1), + &vec![OrderedColumn::new(col_a.clone(), option1)], + &vec![OrderedColumn::new(col_e.clone(), option1)], )); let sort_req_a = PhysicalSortExpr { expr: Arc::new((col_a).clone()) as _, From 63e7fee0859747c06cf009b3614bd9d8f901bd67 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 15:02:52 +0300 Subject: [PATCH 04/23] simplifications --- datafusion/physical-expr/src/utils.rs | 219 +------------------------- 1 file changed, 8 insertions(+), 211 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index af35502ce6e74..9a9d9cc7744ee 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -266,24 +266,7 @@ pub fn normalize_sort_expr( ) } -// fn get_ranges_inside(to_search: &[PhysicalSortExpr], section: &[PhysicalSortExpr]) -> Vec> { -// let n_section = section.len(); -// let n_end = if to_search.len() >= n_section { -// to_search.len() - n_section + 1 -// } else { -// 0 -// }; -// let mut res = vec![]; -// for idx in 0..n_end { -// let end = idx + n_section; -// if to_search[idx..end].eq(section){ -// res.push(Range{start:idx, end}); -// } -// } -// res -// } - -fn get_ranges_inside2(to_search: &[T], section: &[T]) -> Vec> { +fn get_ranges_inside(to_search: &[T], section: &[T]) -> Vec> { let n_section = section.len(); let n_end = if to_search.len() >= n_section { to_search.len() - n_section + 1 @@ -300,17 +283,6 @@ fn get_ranges_inside2(to_search: &[T], section: &[T]) -> Vec(to_search: &[T], section: &[T]) -> Option> { -// let n_section = section.len(); -// for idx in 0..to_search.len() - n_section + 1{ -// let end = idx + n_section; -// if 
to_search[idx..end].eq(section){ -// return Some(Range{start:idx, end}); -// } -// } -// None -// } - fn collapse_vec(in_data: Vec) -> Vec { let mut out_data = vec![]; for elem in in_data { @@ -344,7 +316,7 @@ pub fn normalize_sort_expr2( .collect::>(); println!("normalized_exprs: {:?}", normalized_exprs); println!("elem: {:?}", elem); - let ranges = get_ranges_inside2(&normalized_exprs, &elem); + let ranges = get_ranges_inside(&normalized_exprs, &elem); let mut offset: i64 = 0; for Range { start, end } in ranges { println!("start:{:?}, end:{:?}", start, end); @@ -396,7 +368,7 @@ pub fn normalize_sort_requirements2( .collect::>(); println!("normalized_exprs: {:?}", normalized_exprs); println!("elem: {:?}", elem); - let ranges = get_ranges_inside2(&normalized_exprs, &elem); + let ranges = get_ranges_inside(&normalized_exprs, &elem); let mut offset: i64 = 0; for Range { start, end } in ranges { println!("start:{:?}, end:{:?}", start, end); @@ -440,28 +412,6 @@ pub fn normalize_sort_requirement( ) } -// /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. -// pub fn ordering_satisfy< -// F: FnOnce() -> EquivalenceProperties, -// F2: FnOnce() -> OrderingEquivalenceProperties, -// >( -// provided: Option<&[PhysicalSortExpr]>, -// required: Option<&[PhysicalSortExpr]>, -// equal_properties: F, -// ordering_equal_properties: F2, -// ) -> bool { -// match (provided, required) { -// (_, None) => true, -// (None, Some(_)) => false, -// (Some(provided), Some(required)) => ordering_satisfy_concrete( -// provided, -// required, -// equal_properties, -// ordering_equal_properties, -// ), -// } -// } - /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. pub fn ordering_satisfy< F: FnOnce() -> EquivalenceProperties, @@ -484,41 +434,6 @@ pub fn ordering_satisfy< } } -// /// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the -// /// provided [`PhysicalSortExpr`]s. 
-// pub fn ordering_satisfy_concrete< -// F: FnOnce() -> EquivalenceProperties, -// F2: FnOnce() -> OrderingEquivalenceProperties, -// >( -// provided: &[PhysicalSortExpr], -// required: &[PhysicalSortExpr], -// equal_properties: F, -// ordering_equal_properties: F2, -// ) -> bool { -// let oeq_properties = ordering_equal_properties(); -// let ordering_eq_classes = oeq_properties.classes(); -// let eq_properties = equal_properties(); -// let eq_classes = eq_properties.classes(); -// let mut required_normalized = Vec::new(); -// for expr in required { -// let item = normalize_sort_expr(expr.clone(), eq_classes, ordering_eq_classes); -// if !required_normalized.contains(&item) { -// required_normalized.push(item); -// } -// } -// let provided_normalized = provided -// .iter() -// .map(|e| normalize_sort_expr(e.clone(), eq_classes, ordering_eq_classes)) -// .collect::>(); -// if required_normalized.len() > provided_normalized.len() { -// return false; -// } -// required_normalized -// .into_iter() -// .zip(provided_normalized) -// .all(|(req, given)| given == req) -// } - /// Checks whether the required [`PhysicalSortExpr`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_concrete< @@ -551,29 +466,6 @@ pub fn ordering_satisfy_concrete< .all(|(req, given)| given == req) } -// /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the -// /// provided [`PhysicalSortExpr`]s. 
-// pub fn ordering_satisfy_requirement< -// F: FnOnce() -> EquivalenceProperties, -// F2: FnOnce() -> OrderingEquivalenceProperties, -// >( -// provided: Option<&[PhysicalSortExpr]>, -// required: Option<&[PhysicalSortRequirement]>, -// equal_properties: F, -// ordering_equal_properties: F2, -// ) -> bool { -// match (provided, required) { -// (_, None) => true, -// (None, Some(_)) => false, -// (Some(provided), Some(required)) => ordering_satisfy_requirement_concrete( -// provided, -// required, -// equal_properties, -// ordering_equal_properties, -// ), -// } -// } - /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_requirement< @@ -597,42 +489,6 @@ pub fn ordering_satisfy_requirement< } } -// /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the -// /// provided [`PhysicalSortExpr`]s. -// pub fn ordering_satisfy_requirement_concrete< -// F: FnOnce() -> EquivalenceProperties, -// F2: FnOnce() -> OrderingEquivalenceProperties, -// >( -// provided: &[PhysicalSortExpr], -// required: &[PhysicalSortRequirement], -// equal_properties: F, -// ordering_equal_properties: F2, -// ) -> bool { -// let oeq_properties = ordering_equal_properties(); -// let ordering_eq_classes = oeq_properties.classes(); -// let eq_properties = equal_properties(); -// let eq_classes = eq_properties.classes(); -// let mut required_normalized = Vec::new(); -// for req in required { -// let item = -// normalize_sort_requirement(req.clone(), eq_classes, ordering_eq_classes); -// if !required_normalized.contains(&item) { -// required_normalized.push(item); -// } -// } -// let provided_normalized = provided -// .iter() -// .map(|e| normalize_sort_expr(e.clone(), eq_classes, ordering_eq_classes)) -// .collect::>(); -// if required_normalized.len() > provided_normalized.len() { -// return false; -// } -// required_normalized -// .into_iter() -// .zip(provided_normalized) -// .all(|(req, 
given)| given.satisfy(&req)) -// } - /// Checks whether the given [`PhysicalSortRequirement`]s are satisfied by the /// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_requirement_concrete< @@ -661,29 +517,6 @@ pub fn ordering_satisfy_requirement_concrete< .all(|(req, given)| given.satisfy(&req)) } -// /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more -// /// specific than the provided [`PhysicalSortRequirement`]s. -// pub fn requirements_compatible< -// F: FnOnce() -> OrderingEquivalenceProperties, -// F2: FnOnce() -> EquivalenceProperties, -// >( -// provided: Option<&[PhysicalSortRequirement]>, -// required: Option<&[PhysicalSortRequirement]>, -// ordering_equal_properties: F, -// equal_properties: F2, -// ) -> bool { -// match (provided, required) { -// (_, None) => true, -// (None, Some(_)) => false, -// (Some(provided), Some(required)) => requirements_compatible_concrete( -// provided, -// required, -// ordering_equal_properties, -// equal_properties, -// ), -// } -// } - /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more /// specific than the provided [`PhysicalSortRequirement`]s. pub fn requirements_compatible< @@ -707,42 +540,6 @@ pub fn requirements_compatible< } } -// /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more -// /// specific than the provided [`PhysicalSortRequirement`]s. 
-// fn requirements_compatible_concrete< -// F: FnOnce() -> OrderingEquivalenceProperties, -// F2: FnOnce() -> EquivalenceProperties, -// >( -// provided: &[PhysicalSortRequirement], -// required: &[PhysicalSortRequirement], -// ordering_equal_properties: F, -// equal_properties: F2, -// ) -> bool { -// let oeq_properties = ordering_equal_properties(); -// let ordering_eq_classes = oeq_properties.classes(); -// let eq_properties = equal_properties(); -// let eq_classes = eq_properties.classes(); -// let mut required_normalized = Vec::new(); -// for req in required { -// let item = -// normalize_sort_requirement(req.clone(), eq_classes, ordering_eq_classes); -// if !required_normalized.contains(&item) { -// required_normalized.push(item); -// } -// } -// let provided_normalized = provided -// .iter() -// .map(|e| normalize_sort_requirement(e.clone(), eq_classes, ordering_eq_classes)) -// .collect::>(); -// if required_normalized.len() > provided_normalized.len() { -// return false; -// } -// required_normalized -// .into_iter() -// .zip(provided_normalized) -// .all(|(req, given)| given.compatible(&req)) -// } - /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more /// specific than the provided [`PhysicalSortRequirement`]s. 
fn requirements_compatible_concrete< @@ -1899,19 +1696,19 @@ mod tests { fn test_get_range_inside() -> Result<()> { let empty_vec: Vec> = Vec::new(); assert_eq!( - get_ranges_inside2(&[1, 2, 3], &[1, 2]), + get_ranges_inside(&[1, 2, 3], &[1, 2]), vec![Range { start: 0, end: 2 }] ); assert_eq!( - get_ranges_inside2(&[1, 2, 3], &[2, 3]), + get_ranges_inside(&[1, 2, 3], &[2, 3]), vec![Range { start: 1, end: 3 }] ); - assert_eq!(get_ranges_inside2(&[1, 2, 3], &[1, 3]), empty_vec); + assert_eq!(get_ranges_inside(&[1, 2, 3], &[1, 3]), empty_vec); assert_eq!( - get_ranges_inside2(&[1, 2, 3], &[1, 2, 3]), + get_ranges_inside(&[1, 2, 3], &[1, 2, 3]), vec![Range { start: 0, end: 3 }] ); - assert_eq!(get_ranges_inside2(&[1, 2, 3], &[3, 2]), empty_vec); + assert_eq!(get_ranges_inside(&[1, 2, 3], &[3, 2]), empty_vec); Ok(()) } From 2d7be1317613e3139bdec65593172f6dc09ff4b9 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 15:23:40 +0300 Subject: [PATCH 05/23] Remove unnecessary codes --- .../core/src/physical_plan/aggregates/mod.rs | 6 +- datafusion/core/src/physical_plan/mod.rs | 8 +- .../core/src/physical_plan/projection.rs | 9 +- .../windows/bounded_window_agg_exec.rs | 3 +- .../core/src/physical_plan/windows/mod.rs | 5 +- .../physical_plan/windows/window_agg_exec.rs | 3 +- datafusion/physical-expr/src/equivalence.rs | 149 ++----- datafusion/physical-expr/src/utils.rs | 370 +++++++++--------- 8 files changed, 220 insertions(+), 333 deletions(-) diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/core/src/physical_plan/aggregates/mod.rs index 4a66d06ce180c..876e2d77e45be 100644 --- a/datafusion/core/src/physical_plan/aggregates/mod.rs +++ b/datafusion/core/src/physical_plan/aggregates/mod.rs @@ -55,7 +55,6 @@ mod row_hash; mod utils; pub use datafusion_expr::AggregateFunction; -use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; pub use datafusion_physical_expr::expressions::create_aggregate_expr; /// Hash 
aggregate modes @@ -347,7 +346,7 @@ fn output_group_expr_helper(group_by: &PhysicalGroupBy) -> Vec EquivalenceProperties, - F2: Fn() -> OrderingEquivalenceProperties2, + F2: Fn() -> OrderingEquivalenceProperties, >( order_by_expr: &[Option>], eq_properties: F, @@ -1075,7 +1074,6 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::{DataFusionError, Result, ScalarValue}; - use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::expressions::{ lit, ApproxDistinct, Column, Count, Median, }; @@ -1736,7 +1734,7 @@ mod tests { let col_c = Column::new("c", 2); let col_d = Column::new("d", 3); eq_properties.add_equal_conditions((&col_a, &col_b)); - let mut ordering_eq_properties = OrderingEquivalenceProperties2::new(test_schema); + let mut ordering_eq_properties = OrderingEquivalenceProperties::new(test_schema); ordering_eq_properties.add_equal_conditions(( &vec![OrderedColumn::new(col_a.clone(), options1)], &vec![OrderedColumn::new(col_c.clone(), options2)], diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs index dac8774d7e786..c8edf701cf05c 100644 --- a/datafusion/core/src/physical_plan/mod.rs +++ b/datafusion/core/src/physical_plan/mod.rs @@ -32,9 +32,7 @@ use arrow::record_batch::RecordBatch; pub use datafusion_expr::Accumulator; pub use datafusion_expr::ColumnarValue; pub use datafusion_physical_expr::aggregate::row_accumulator::RowAccumulator; -use datafusion_physical_expr::equivalence::{ - OrderingEquivalenceProperties, OrderingEquivalenceProperties2, -}; +use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties; pub use display::DisplayFormatType; use futures::stream::{Stream, TryStreamExt}; use std::fmt; @@ -191,8 +189,8 @@ pub trait ExecutionPlan: Debug + Send + Sync { } /// Get the OrderingEquivalenceProperties within the plan - fn ordering_equivalence_properties(&self) 
-> OrderingEquivalenceProperties2 { - OrderingEquivalenceProperties2::new(self.schema()) + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { + OrderingEquivalenceProperties::new(self.schema()) } /// Get a list of child execution plans that provide the input for this plan. The returned list diff --git a/datafusion/core/src/physical_plan/projection.rs b/datafusion/core/src/physical_plan/projection.rs index 17ca50ce1c9ba..f2775079fc4a4 100644 --- a/datafusion/core/src/physical_plan/projection.rs +++ b/datafusion/core/src/physical_plan/projection.rs @@ -41,9 +41,6 @@ use super::expressions::{Column, PhysicalSortExpr}; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use super::{RecordBatchStream, SendableRecordBatchStream, Statistics}; -use datafusion_physical_expr::equivalence::{ - project_ordering_equivalence_properties2, OrderingEquivalenceProperties2, -}; use datafusion_physical_expr::{ normalize_out_expr_with_columns_map, project_equivalence_properties, project_ordering_equivalence_properties, OrderingEquivalenceProperties, @@ -219,9 +216,9 @@ impl ExecutionPlan for ProjectionExec { new_properties } - fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties2 { - let mut new_properties = OrderingEquivalenceProperties2::new(self.schema()); - project_ordering_equivalence_properties2( + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { + let mut new_properties = OrderingEquivalenceProperties::new(self.schema()); + project_ordering_equivalence_properties( self.input.ordering_equivalence_properties(), &self.columns_map, &mut new_properties, diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs index f63aae5d1c9fa..95b482ef24963 100644 --- a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs +++ 
b/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs @@ -60,7 +60,6 @@ use datafusion_common::utils::{ }; use datafusion_common::DataFusionError; use datafusion_expr::ColumnarValue; -use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::hash_utils::create_hashes; use datafusion_physical_expr::window::{ PartitionBatchState, PartitionBatches, PartitionKey, PartitionWindowAggStates, @@ -262,7 +261,7 @@ impl ExecutionPlan for BoundedWindowAggExec { } /// Get the OrderingEquivalenceProperties within the plan - fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties2 { + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { window_ordering_equivalence(&self.schema, &self.input, &self.window_expr) } diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index 01d6c52e238f9..e66beef6fc491 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -47,7 +47,6 @@ mod window_agg_exec; pub use bounded_window_agg_exec::BoundedWindowAggExec; pub use bounded_window_agg_exec::PartitionSearchMode; use datafusion_common::utils::longest_consecutive_prefix; -use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::{convert_to_expr, get_indices_of_matching_exprs}; pub use datafusion_physical_expr::window::{ @@ -252,10 +251,10 @@ pub(crate) fn window_ordering_equivalence( schema: &SchemaRef, input: &Arc, window_expr: &[Arc], -) -> OrderingEquivalenceProperties2 { +) -> OrderingEquivalenceProperties { // We need to update the schema, so we can not directly use // `input.ordering_equivalence_properties()`. 
- let mut result = OrderingEquivalenceProperties2::new(schema.clone()); + let mut result = OrderingEquivalenceProperties::new(schema.clone()); result.extend( input .ordering_equivalence_properties() diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs index f3f416bd7ae0c..dc0302d77b983 100644 --- a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs +++ b/datafusion/core/src/physical_plan/windows/window_agg_exec.rs @@ -42,7 +42,6 @@ use arrow::{ }; use datafusion_common::utils::{evaluate_partition_ranges, get_at_indices}; use datafusion_common::DataFusionError; -use datafusion_physical_expr::equivalence::OrderingEquivalenceProperties2; use datafusion_physical_expr::{OrderingEquivalenceProperties, PhysicalSortRequirement}; use futures::stream::Stream; use futures::{ready, StreamExt}; @@ -193,7 +192,7 @@ impl ExecutionPlan for WindowAggExec { } /// Get the OrderingEquivalenceProperties within the plan - fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties2 { + fn ordering_equivalence_properties(&self) -> OrderingEquivalenceProperties { window_ordering_equivalence(&self.schema, &self.input, &self.window_expr) } diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 4dd04a9160236..f9185837c18fd 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -119,8 +119,7 @@ impl EquivalenceProperties { /// where both `a ASC` and `b DESC` can describe the table ordering. With /// `OrderingEquivalenceProperties`, we can keep track of these equivalences /// and treat `a ASC` and `b DESC` as the same ordering requirement. 
-pub type OrderingEquivalenceProperties = EquivalenceProperties; -pub type OrderingEquivalenceProperties2 = EquivalenceProperties>; +pub type OrderingEquivalenceProperties = EquivalenceProperties>; /// EquivalentClass is a set of [`Column`]s or [`OrderedColumn`]s that are known /// to have the same value in all tuples in a relation. `EquivalentClass` @@ -188,67 +187,6 @@ impl EquivalentClass { } } -#[derive(Debug, Clone)] -pub struct EquivalentClass2> { - /// First element in the EquivalentClass - head: T, - /// Other equal columns - others: HashSet, -} - -impl EquivalentClass2 { - pub fn new(head: T, others: Vec) -> EquivalentClass2 { - EquivalentClass2 { - head, - others: HashSet::from_iter(others), - } - } - - pub fn head(&self) -> &T { - &self.head - } - - pub fn others(&self) -> &HashSet { - &self.others - } - - pub fn contains(&self, col: &T) -> bool { - self.head == *col || self.others.contains(col) - } - - pub fn insert(&mut self, col: T) -> bool { - self.head != col && self.others.insert(col) - } - - pub fn remove(&mut self, col: &T) -> bool { - let removed = self.others.remove(col); - if !removed && *col == self.head { - let one_col = self.others.iter().next().cloned(); - if let Some(col) = one_col { - let removed = self.others.remove(&col); - self.head = col; - removed - } else { - false - } - } else { - removed - } - } - - pub fn iter(&self) -> impl Iterator { - std::iter::once(&self.head).chain(self.others.iter()) - } - - pub fn len(&self) -> usize { - self.others.len() + 1 - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } -} - /// This object represents a [`Column`] with a definite ordering. 
#[derive(Debug, Hash, PartialEq, Eq, Clone)] pub struct OrderedColumn { @@ -296,50 +234,9 @@ impl ColumnAccessor for OrderedColumn { } } -pub type OrderingEquivalentClass = EquivalentClass; -pub type OrderingEquivalentClass2 = EquivalentClass>; +pub type OrderingEquivalentClass = EquivalentClass>; impl OrderingEquivalentClass { - /// Finds the matching column inside the `OrderingEquivalentClass`. - fn get_matching_column(&self, column: &Column) -> Option { - if self.head.col.eq(column) { - Some(self.head.clone()) - } else { - for item in &self.others { - if item.col.eq(column) { - return Some(item.clone()); - } - } - None - } - } - - fn update_with_aliases(&mut self, columns_map: &HashMap>) { - for (column, columns) in columns_map { - if self.head.col.eq(column) { - for col in columns { - self.insert(OrderedColumn { - col: col.clone(), - options: self.head.options, - }); - } - } else { - for item in self.others.clone() { - if item.col.eq(column) { - for col in columns { - self.insert(OrderedColumn { - col: col.clone(), - options: item.options, - }); - } - } - } - } - } - } -} - -impl OrderingEquivalentClass2 { fn update_with_aliases(&mut self, columns_map: &HashMap>) { for (column, columns) in columns_map { for ordering in vec![self.head.clone()] @@ -394,6 +291,27 @@ pub fn project_equivalence_properties( output_eq.extend(ec_classes); } +// /// This function applies the given projection to the given ordering +// /// equivalence properties to compute the resulting (projected) ordering +// /// equivalence properties; e.g. +// /// 1) Adding an alias, which can introduce additional ordering equivalence +// /// properties, as in Projection(a, a as a1, a as a2) extends global ordering +// /// of a to a1 and a2. +// /// 2) Truncate the [`OrderingEquivalentClass`]es that are not in the output schema. 
+// pub fn project_ordering_equivalence_properties( +// input_eq: OrderingEquivalenceProperties, +// columns_map: &HashMap>, +// output_eq: &mut OrderingEquivalenceProperties, +// ) { +// let mut ec_classes = input_eq.classes().to_vec(); +// for class in ec_classes.iter_mut() { +// class.update_with_aliases(columns_map); +// } +// +// prune_columns_to_remove(output_eq, &mut ec_classes); +// output_eq.extend(ec_classes); +// } + /// This function applies the given projection to the given ordering /// equivalence properties to compute the resulting (projected) ordering /// equivalence properties; e.g. @@ -411,27 +329,6 @@ pub fn project_ordering_equivalence_properties( class.update_with_aliases(columns_map); } - prune_columns_to_remove(output_eq, &mut ec_classes); - output_eq.extend(ec_classes); -} - -/// This function applies the given projection to the given ordering -/// equivalence properties to compute the resulting (projected) ordering -/// equivalence properties; e.g. -/// 1) Adding an alias, which can introduce additional ordering equivalence -/// properties, as in Projection(a, a as a1, a as a2) extends global ordering -/// of a to a1 and a2. -/// 2) Truncate the [`OrderingEquivalentClass`]es that are not in the output schema. 
-pub fn project_ordering_equivalence_properties2( - input_eq: OrderingEquivalenceProperties2, - columns_map: &HashMap>, - output_eq: &mut OrderingEquivalenceProperties2, -) { - let mut ec_classes = input_eq.classes().to_vec(); - for class in ec_classes.iter_mut() { - class.update_with_aliases(columns_map); - } - // prune_columns_to_remove(output_eq, &mut ec_classes); // TODO: Add pruning output_eq.extend(ec_classes); diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 9a9d9cc7744ee..5cf2a160a292f 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -17,7 +17,7 @@ use crate::equivalence::{ EquivalenceProperties, EquivalentClass, OrderedColumn, OrderingEquivalenceProperties, - OrderingEquivalenceProperties2, OrderingEquivalentClass, OrderingEquivalentClass2, + OrderingEquivalentClass, }; use crate::expressions::{BinaryExpr, Column, UnKnownColumn}; use crate::{PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; @@ -165,47 +165,47 @@ pub fn normalize_expr_with_equivalence_properties( .unwrap_or(expr) } -fn normalize_expr_with_ordering_equivalence_properties( - expr: Arc, - sort_options: SortOptions, - eq_properties: &[OrderingEquivalentClass], -) -> (Arc, SortOptions) { - let normalized_expr = expr - .clone() - .transform(&|expr| { - let normalized_form = - expr.as_any().downcast_ref::().and_then(|column| { - for class in eq_properties { - let ordered_column = OrderedColumn { - col: column.clone(), - options: sort_options, - }; - if class.contains(&ordered_column) { - return Some(class.head().clone()); - } - } - None - }); - Ok(if let Some(normalized_form) = normalized_form { - Transformed::Yes(Arc::new(normalized_form.col) as _) - } else { - Transformed::No(expr) - }) - }) - .unwrap_or_else(|_| expr.clone()); - if expr.ne(&normalized_expr) { - if let Some(col) = normalized_expr.as_any().downcast_ref::() { - for eq_class in eq_properties.iter() { - let head = 
eq_class.head(); - if head.col.eq(col) { - // Use options of the normalized version: - return (normalized_expr, head.options); - } - } - } - } - (expr, sort_options) -} +// fn normalize_expr_with_ordering_equivalence_properties( +// expr: Arc, +// sort_options: SortOptions, +// eq_properties: &[OrderingEquivalentClass], +// ) -> (Arc, SortOptions) { +// let normalized_expr = expr +// .clone() +// .transform(&|expr| { +// let normalized_form = +// expr.as_any().downcast_ref::().and_then(|column| { +// for class in eq_properties { +// let ordered_column = OrderedColumn { +// col: column.clone(), +// options: sort_options, +// }; +// if class.contains(&ordered_column) { +// return Some(class.head().clone()); +// } +// } +// None +// }); +// Ok(if let Some(normalized_form) = normalized_form { +// Transformed::Yes(Arc::new(normalized_form.col) as _) +// } else { +// Transformed::No(expr) +// }) +// }) +// .unwrap_or_else(|_| expr.clone()); +// if expr.ne(&normalized_expr) { +// if let Some(col) = normalized_expr.as_any().downcast_ref::() { +// for eq_class in eq_properties.iter() { +// let head = eq_class.head(); +// if head.col.eq(col) { +// // Use options of the normalized version: +// return (normalized_expr, head.options); +// } +// } +// } +// } +// (expr, sort_options) +// } fn normalize_sort_expr_with_equivalence_properties( mut sort_expr: PhysicalSortExpr, @@ -216,18 +216,18 @@ fn normalize_sort_expr_with_equivalence_properties( sort_expr } -fn normalize_sort_expr_with_ordering_equivalence_properties( - mut sort_expr: PhysicalSortExpr, - eq_properties: &[OrderingEquivalentClass], -) -> PhysicalSortExpr { - (sort_expr.expr, sort_expr.options) = - normalize_expr_with_ordering_equivalence_properties( - sort_expr.expr.clone(), - sort_expr.options, - eq_properties, - ); - sort_expr -} +// fn normalize_sort_expr_with_ordering_equivalence_properties( +// mut sort_expr: PhysicalSortExpr, +// eq_properties: &[OrderingEquivalentClass], +// ) -> PhysicalSortExpr { +// 
(sort_expr.expr, sort_expr.options) = +// normalize_expr_with_ordering_equivalence_properties( +// sort_expr.expr.clone(), +// sort_expr.options, +// eq_properties, +// ); +// sort_expr +// } fn normalize_sort_requirement_with_equivalence_properties( mut sort_requirement: PhysicalSortRequirement, @@ -238,33 +238,33 @@ fn normalize_sort_requirement_with_equivalence_properties( sort_requirement } -fn normalize_sort_requirement_with_ordering_equivalence_properties( - mut sort_requirement: PhysicalSortRequirement, - eq_properties: &[OrderingEquivalentClass], -) -> PhysicalSortRequirement { - if let Some(options) = &mut sort_requirement.options { - (sort_requirement.expr, *options) = - normalize_expr_with_ordering_equivalence_properties( - sort_requirement.expr, - *options, - eq_properties, - ); - } - sort_requirement -} - -pub fn normalize_sort_expr( - sort_expr: PhysicalSortExpr, - eq_properties: &[EquivalentClass], - ordering_eq_properties: &[OrderingEquivalentClass], -) -> PhysicalSortExpr { - let normalized = - normalize_sort_expr_with_equivalence_properties(sort_expr, eq_properties); - normalize_sort_expr_with_ordering_equivalence_properties( - normalized, - ordering_eq_properties, - ) -} +// fn normalize_sort_requirement_with_ordering_equivalence_properties( +// mut sort_requirement: PhysicalSortRequirement, +// eq_properties: &[OrderingEquivalentClass], +// ) -> PhysicalSortRequirement { +// if let Some(options) = &mut sort_requirement.options { +// (sort_requirement.expr, *options) = +// normalize_expr_with_ordering_equivalence_properties( +// sort_requirement.expr, +// *options, +// eq_properties, +// ); +// } +// sort_requirement +// } + +// pub fn normalize_sort_expr( +// sort_expr: PhysicalSortExpr, +// eq_properties: &[EquivalentClass], +// ordering_eq_properties: &[OrderingEquivalentClass], +// ) -> PhysicalSortExpr { +// let normalized = +// normalize_sort_expr_with_equivalence_properties(sort_expr, eq_properties); +// 
normalize_sort_expr_with_ordering_equivalence_properties( +// normalized, +// ordering_eq_properties, +// ) +// } fn get_ranges_inside(to_search: &[T], section: &[T]) -> Vec> { let n_section = section.len(); @@ -296,7 +296,7 @@ fn collapse_vec(in_data: Vec) -> Vec { pub fn normalize_sort_expr2( sort_exprs: &[PhysicalSortExpr], eq_properties: &[EquivalentClass], - ordering_eq_properties: &[OrderingEquivalentClass2], + ordering_eq_properties: &[OrderingEquivalentClass], ) -> Vec { let mut normalized_exprs = sort_exprs .iter() @@ -348,7 +348,7 @@ pub fn normalize_sort_expr2( pub fn normalize_sort_requirements2( sort_exprs: &[PhysicalSortRequirement], eq_properties: &[EquivalentClass], - ordering_eq_properties: &[OrderingEquivalentClass2], + ordering_eq_properties: &[OrderingEquivalentClass], ) -> Vec { let mut normalized_exprs = sort_exprs .iter() @@ -397,25 +397,25 @@ pub fn normalize_sort_requirements2( collapse_vec(normalized_exprs) } -pub fn normalize_sort_requirement( - sort_requirement: PhysicalSortRequirement, - eq_properties: &[EquivalentClass], - ordering_eq_properties: &[OrderingEquivalentClass], -) -> PhysicalSortRequirement { - let normalized = normalize_sort_requirement_with_equivalence_properties( - sort_requirement, - eq_properties, - ); - normalize_sort_requirement_with_ordering_equivalence_properties( - normalized, - ordering_eq_properties, - ) -} +// pub fn normalize_sort_requirement( +// sort_requirement: PhysicalSortRequirement, +// eq_properties: &[EquivalentClass], +// ordering_eq_properties: &[OrderingEquivalentClass], +// ) -> PhysicalSortRequirement { +// let normalized = normalize_sort_requirement_with_equivalence_properties( +// sort_requirement, +// eq_properties, +// ); +// normalize_sort_requirement_with_ordering_equivalence_properties( +// normalized, +// ordering_eq_properties, +// ) +// } /// Checks whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. 
pub fn ordering_satisfy< F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties2, + F2: FnOnce() -> OrderingEquivalenceProperties, >( provided: Option<&[PhysicalSortExpr]>, required: Option<&[PhysicalSortExpr]>, @@ -438,7 +438,7 @@ pub fn ordering_satisfy< /// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_concrete< F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties2, + F2: FnOnce() -> OrderingEquivalenceProperties, >( provided: &[PhysicalSortExpr], required: &[PhysicalSortExpr], @@ -470,7 +470,7 @@ pub fn ordering_satisfy_concrete< /// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_requirement< F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties2, + F2: FnOnce() -> OrderingEquivalenceProperties, >( provided: Option<&[PhysicalSortExpr]>, required: Option<&[PhysicalSortRequirement]>, @@ -493,7 +493,7 @@ pub fn ordering_satisfy_requirement< /// provided [`PhysicalSortExpr`]s. pub fn ordering_satisfy_requirement_concrete< F: FnOnce() -> EquivalenceProperties, - F2: FnOnce() -> OrderingEquivalenceProperties2, + F2: FnOnce() -> OrderingEquivalenceProperties, >( provided: &[PhysicalSortExpr], required: &[PhysicalSortRequirement], @@ -520,7 +520,7 @@ pub fn ordering_satisfy_requirement_concrete< /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more /// specific than the provided [`PhysicalSortRequirement`]s. pub fn requirements_compatible< - F: FnOnce() -> OrderingEquivalenceProperties2, + F: FnOnce() -> OrderingEquivalenceProperties, F2: FnOnce() -> EquivalenceProperties, >( provided: Option<&[PhysicalSortRequirement]>, @@ -543,7 +543,7 @@ pub fn requirements_compatible< /// Checks whether the given [`PhysicalSortRequirement`]s are equal or more /// specific than the provided [`PhysicalSortRequirement`]s. 
fn requirements_compatible_concrete< - F: FnOnce() -> OrderingEquivalenceProperties2, + F: FnOnce() -> OrderingEquivalenceProperties, F2: FnOnce() -> EquivalenceProperties, >( provided: &[PhysicalSortRequirement], @@ -827,7 +827,7 @@ mod tests { use datafusion_common::{Result, ScalarValue}; use std::fmt::{Display, Formatter}; - use crate::equivalence::OrderingEquivalenceProperties2; + use crate::equivalence::OrderingEquivalenceProperties; use arrow_schema::{DataType, Field, Schema}; use petgraph::visit::Bfs; use std::sync::Arc; @@ -910,12 +910,12 @@ mod tests { let mut ordering_eq_properties = OrderingEquivalenceProperties::new(test_schema.clone()); ordering_eq_properties.add_equal_conditions(( - &OrderedColumn::new(col_a.clone(), option1), - &OrderedColumn::new(col_d.clone(), option1), + &vec![OrderedColumn::new(col_a.clone(), option1)], + &vec![OrderedColumn::new(col_d.clone(), option1)], )); ordering_eq_properties.add_equal_conditions(( - &OrderedColumn::new(col_a.clone(), option1), - &OrderedColumn::new(col_e.clone(), option2), + &vec![OrderedColumn::new(col_a.clone(), option1)], + &vec![OrderedColumn::new(col_e.clone(), option2)], )); Ok((test_schema, eq_properties, ordering_eq_properties)) } @@ -923,7 +923,7 @@ mod tests { fn create_test_params2() -> Result<( SchemaRef, EquivalenceProperties, - OrderingEquivalenceProperties2, + OrderingEquivalenceProperties, )> { // Assume schema satisfies ordering a ASC NULLS LAST // and d ASC NULLS LAST, b ASC NULLS LAST and e DESC NULLS FIRST, b ASC NULLS LAST @@ -945,7 +945,7 @@ mod tests { let mut eq_properties = EquivalenceProperties::new(test_schema.clone()); eq_properties.add_equal_conditions((col_a, col_c)); let mut ordering_eq_properties = - OrderingEquivalenceProperties2::new(test_schema.clone()); + OrderingEquivalenceProperties::new(test_schema.clone()); ordering_eq_properties.add_equal_conditions(( &vec![OrderedColumn::new(col_a.clone(), option1)], &vec![ @@ -1235,13 +1235,13 @@ mod tests { finer, crude, || { 
EquivalenceProperties::new(empty_schema.clone()) }, - || { OrderingEquivalenceProperties2::new(empty_schema.clone()) }, + || { OrderingEquivalenceProperties::new(empty_schema.clone()) }, )); assert!(!ordering_satisfy( crude, finer, || { EquivalenceProperties::new(empty_schema.clone()) }, - || { OrderingEquivalenceProperties2::new(empty_schema.clone()) }, + || { OrderingEquivalenceProperties::new(empty_schema.clone()) }, )); Ok(()) } @@ -1429,26 +1429,26 @@ mod tests { ); } - // Test cases for ordering equivalence normalization - // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. - let expressions = vec![ - (&col_d_expr, option1, option1, &col_a_expr), - (&col_e_expr, option2, option1, &col_a_expr), - // Cannot normalize, hence should return itself. - (&col_e_expr, option1, option1, &col_e_expr), - ]; - for (expr, sort_options, expected_options, expected_ordering_eq) in expressions { - let (normalized_expr, options) = - normalize_expr_with_ordering_equivalence_properties( - expr.clone(), - sort_options, - ordering_eq_properties.classes(), - ); - assert!( - normalized_expr.eq(expected_ordering_eq) && (expected_options == options), - "error in test: expr: {expr:?}, sort_options: {sort_options:?}" - ); - } + // // Test cases for ordering equivalence normalization + // // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. + // let expressions = vec![ + // (&col_d_expr, option1, option1, &col_a_expr), + // (&col_e_expr, option2, option1, &col_a_expr), + // // Cannot normalize, hence should return itself. 
+ // (&col_e_expr, option1, option1, &col_e_expr), + // ]; + // for (expr, sort_options, expected_options, expected_ordering_eq) in expressions { + // let (normalized_expr, options) = + // normalize_expr_with_ordering_equivalence_properties( + // expr.clone(), + // sort_options, + // ordering_eq_properties.classes(), + // ); + // assert!( + // normalized_expr.eq(expected_ordering_eq) && (expected_options == options), + // "error in test: expr: {expr:?}, sort_options: {sort_options:?}" + // ); + // } Ok(()) } @@ -1500,31 +1500,31 @@ mod tests { ); } - // Test cases for ordering equivalence normalization - // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. - let expressions = vec![ - (&col_d, option1, &col_a, option1), - (&col_e, option2, &col_a, option1), - ]; - for (expr, sort_options, expected_col, expected_options) in - expressions.into_iter() - { - let expected = PhysicalSortExpr { - expr: Arc::new((*expected_col).clone()) as _, - options: expected_options, - }; - let arg = PhysicalSortExpr { - expr: Arc::new((*expr).clone()) as _, - options: sort_options, - }; - assert!( - expected.eq(&normalize_sort_expr_with_ordering_equivalence_properties( - arg.clone(), - ordering_eq_properties.classes() - )), - "error in test: expr: {expr:?}, sort_options: {sort_options:?}" - ); - } + // // Test cases for ordering equivalence normalization + // // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. 
+ // let expressions = vec![ + // (&col_d, option1, &col_a, option1), + // (&col_e, option2, &col_a, option1), + // ]; + // for (expr, sort_options, expected_col, expected_options) in + // expressions.into_iter() + // { + // let expected = PhysicalSortExpr { + // expr: Arc::new((*expected_col).clone()) as _, + // options: expected_options, + // }; + // let arg = PhysicalSortExpr { + // expr: Arc::new((*expr).clone()) as _, + // options: sort_options, + // }; + // assert!( + // expected.eq(&normalize_sort_expr_with_ordering_equivalence_properties( + // arg.clone(), + // ordering_eq_properties.classes() + // )), + // "error in test: expr: {expr:?}, sort_options: {sort_options:?}" + // ); + // } Ok(()) } @@ -1577,33 +1577,33 @@ mod tests { ); } - // Test cases for ordering equivalence normalization - // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. - let expressions = vec![ - (&col_d, Some(option1), &col_a, Some(option1)), - (&col_e, Some(option2), &col_a, Some(option1)), - ]; - for (expr, sort_options, expected_col, expected_options) in - expressions.into_iter() - { - let expected = PhysicalSortRequirement::new( - Arc::new((*expected_col).clone()) as _, - expected_options, - ); - let arg = PhysicalSortRequirement::new( - Arc::new((*expr).clone()) as _, - sort_options, - ); - assert!( - expected.eq( - &normalize_sort_requirement_with_ordering_equivalence_properties( - arg.clone(), - ordering_eq_properties.classes() - ) - ), - "error in test: expr: {expr:?}, sort_options: {sort_options:?}" - ); - } + // // Test cases for ordering equivalence normalization + // // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. 
+ // let expressions = vec![ + // (&col_d, Some(option1), &col_a, Some(option1)), + // (&col_e, Some(option2), &col_a, Some(option1)), + // ]; + // for (expr, sort_options, expected_col, expected_options) in + // expressions.into_iter() + // { + // let expected = PhysicalSortRequirement::new( + // Arc::new((*expected_col).clone()) as _, + // expected_options, + // ); + // let arg = PhysicalSortRequirement::new( + // Arc::new((*expr).clone()) as _, + // sort_options, + // ); + // assert!( + // expected.eq( + // &normalize_sort_requirement_with_ordering_equivalence_properties( + // arg.clone(), + // ordering_eq_properties.classes() + // ) + // ), + // "error in test: expr: {expr:?}, sort_options: {sort_options:?}" + // ); + // } Ok(()) } @@ -1624,7 +1624,7 @@ mod tests { eq_properties.add_equal_conditions((col_a, col_c)); // Column a and e are ordering equivalent (e.g global ordering of the table can be described both as a ASC and e ASC.) - let mut ordering_eq_properties = OrderingEquivalenceProperties2::new(test_schema); + let mut ordering_eq_properties = OrderingEquivalenceProperties::new(test_schema); ordering_eq_properties.add_equal_conditions(( &vec![OrderedColumn::new(col_a.clone(), option1)], &vec![OrderedColumn::new(col_e.clone(), option1)], From 4d84ad6f4287e3e28a9d838cf8f24ec4918785ee Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 15:27:29 +0300 Subject: [PATCH 06/23] simplifications --- datafusion/physical-expr/src/utils.rs | 109 -------------------------- 1 file changed, 109 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 5cf2a160a292f..1fcc4a19acfdc 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -165,48 +165,6 @@ pub fn normalize_expr_with_equivalence_properties( .unwrap_or(expr) } -// fn normalize_expr_with_ordering_equivalence_properties( -// expr: Arc, -// sort_options: SortOptions, -// eq_properties: 
&[OrderingEquivalentClass], -// ) -> (Arc, SortOptions) { -// let normalized_expr = expr -// .clone() -// .transform(&|expr| { -// let normalized_form = -// expr.as_any().downcast_ref::().and_then(|column| { -// for class in eq_properties { -// let ordered_column = OrderedColumn { -// col: column.clone(), -// options: sort_options, -// }; -// if class.contains(&ordered_column) { -// return Some(class.head().clone()); -// } -// } -// None -// }); -// Ok(if let Some(normalized_form) = normalized_form { -// Transformed::Yes(Arc::new(normalized_form.col) as _) -// } else { -// Transformed::No(expr) -// }) -// }) -// .unwrap_or_else(|_| expr.clone()); -// if expr.ne(&normalized_expr) { -// if let Some(col) = normalized_expr.as_any().downcast_ref::() { -// for eq_class in eq_properties.iter() { -// let head = eq_class.head(); -// if head.col.eq(col) { -// // Use options of the normalized version: -// return (normalized_expr, head.options); -// } -// } -// } -// } -// (expr, sort_options) -// } - fn normalize_sort_expr_with_equivalence_properties( mut sort_expr: PhysicalSortExpr, eq_properties: &[EquivalentClass], @@ -216,18 +174,6 @@ fn normalize_sort_expr_with_equivalence_properties( sort_expr } -// fn normalize_sort_expr_with_ordering_equivalence_properties( -// mut sort_expr: PhysicalSortExpr, -// eq_properties: &[OrderingEquivalentClass], -// ) -> PhysicalSortExpr { -// (sort_expr.expr, sort_expr.options) = -// normalize_expr_with_ordering_equivalence_properties( -// sort_expr.expr.clone(), -// sort_expr.options, -// eq_properties, -// ); -// sort_expr -// } fn normalize_sort_requirement_with_equivalence_properties( mut sort_requirement: PhysicalSortRequirement, @@ -238,34 +184,6 @@ fn normalize_sort_requirement_with_equivalence_properties( sort_requirement } -// fn normalize_sort_requirement_with_ordering_equivalence_properties( -// mut sort_requirement: PhysicalSortRequirement, -// eq_properties: &[OrderingEquivalentClass], -// ) -> PhysicalSortRequirement { -// if 
let Some(options) = &mut sort_requirement.options { -// (sort_requirement.expr, *options) = -// normalize_expr_with_ordering_equivalence_properties( -// sort_requirement.expr, -// *options, -// eq_properties, -// ); -// } -// sort_requirement -// } - -// pub fn normalize_sort_expr( -// sort_expr: PhysicalSortExpr, -// eq_properties: &[EquivalentClass], -// ordering_eq_properties: &[OrderingEquivalentClass], -// ) -> PhysicalSortExpr { -// let normalized = -// normalize_sort_expr_with_equivalence_properties(sort_expr, eq_properties); -// normalize_sort_expr_with_ordering_equivalence_properties( -// normalized, -// ordering_eq_properties, -// ) -// } - fn get_ranges_inside(to_search: &[T], section: &[T]) -> Vec> { let n_section = section.len(); let n_end = if to_search.len() >= n_section { @@ -1577,33 +1495,6 @@ mod tests { ); } - // // Test cases for ordering equivalence normalization - // // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. 
- // let expressions = vec![ - // (&col_d, Some(option1), &col_a, Some(option1)), - // (&col_e, Some(option2), &col_a, Some(option1)), - // ]; - // for (expr, sort_options, expected_col, expected_options) in - // expressions.into_iter() - // { - // let expected = PhysicalSortRequirement::new( - // Arc::new((*expected_col).clone()) as _, - // expected_options, - // ); - // let arg = PhysicalSortRequirement::new( - // Arc::new((*expr).clone()) as _, - // sort_options, - // ); - // assert!( - // expected.eq( - // &normalize_sort_requirement_with_ordering_equivalence_properties( - // arg.clone(), - // ordering_eq_properties.classes() - // ) - // ), - // "error in test: expr: {expr:?}, sort_options: {sort_options:?}" - // ); - // } Ok(()) } From fc014c4035ecaba7f8fff77de18696c26608621b Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 15:51:05 +0300 Subject: [PATCH 07/23] Add test cases --- .../physical_optimizer/sort_enforcement.rs | 139 +++++++++++++++++- .../core/src/physical_plan/projection.rs | 2 + .../core/src/physical_plan/windows/mod.rs | 1 + datafusion/physical-expr/src/equivalence.rs | 1 + 4 files changed, 142 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index f71c79e9fc829..6b34b575e1126 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -49,7 +49,7 @@ use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::windows::{ BoundedWindowAggExec, PartitionSearchMode, WindowAggExec, }; -use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan}; +use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan, displayable}; use arrow::datatypes::SchemaRef; use datafusion_common::tree_node::{Transformed, TreeNode, VisitRecursion}; use 
datafusion_common::utils::{get_at_indices, longest_consecutive_prefix}; @@ -444,6 +444,13 @@ fn parallelize_sorts( })) } +fn print_plan(plan: &Arc) -> Result<()>{ + let formatted = displayable(plan.as_ref()).indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + println!("{:#?}", actual); + Ok(()) +} + /// This function enforces sorting requirements and makes optimizations without /// violating these requirements whenever possible. fn ensure_sorting( @@ -454,6 +461,10 @@ fn ensure_sorting( return Ok(Transformed::No(requirements)); } let plan = requirements.plan; + println!("--------------------------------------"); + print_plan(&plan)?; + println!("plan.output ordering:{:?}", plan.output_ordering()); + println!("--------------------------------------"); let mut children = plan.children(); let mut sort_onwards = requirements.sort_onwards; if let Some(result) = analyze_immediate_sort_removal(&plan, &sort_onwards) { @@ -2894,3 +2905,129 @@ mod tests { ) } } + + +mod tmp_tests{ + use tempfile::TempDir; + use datafusion_common::Result; + use datafusion_execution::config::SessionConfig; + use crate::assert_batches_eq; + use crate::physical_plan::{collect, displayable}; + use crate::prelude::SessionContext; + + #[tokio::test] + async fn test_source_rn_ordered() -> Result<()> { + let config = SessionConfig::new() + .with_target_partitions(1); + let ctx = SessionContext::with_config(config); + ctx.sql("CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite ( + ts INTEGER, + inc_col INTEGER, + desc_col INTEGER, + ) + STORED AS CSV + WITH HEADER ROW + WITH ORDER (ts ASC) + LOCATION 'tests/data/window_1.csv'").await?; + + let sql = "SELECT ts, rn1 FROM (SELECT ts, inc_col, + ROW_NUMBER() OVER() as rn1 + FROM annotated_data_infinite + ORDER BY ts ASC) + ORDER BY rn1 + LIMIT 5"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let physical_plan = 
dataframe.create_physical_plan().await?; + let formatted = displayable(physical_plan.as_ref()).indent().to_string(); + let expected = { + vec![ + "GlobalLimitExec: skip=0, fetch=5", + " ProjectionExec: expr=[ts@0 as ts, ROW_NUMBER() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1]", + " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }], mode=[Sorted]", + ] + }; + + let actual: Vec<&str> = formatted.trim().lines().collect(); + let actual_len = actual.len(); + let actual_trim_last = &actual[..actual_len - 1]; + assert_eq!( + expected, actual_trim_last, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + let actual = collect(physical_plan, ctx.task_ctx()).await?; + let expected = vec![ + "+----+-----+", + "| ts | rn1 |", + "+----+-----+", + "| 1 | 1 |", + "| 1 | 2 |", + "| 5 | 3 |", + "| 9 | 4 |", + "| 10 | 5 |", + "+----+-----+", + ]; + assert_batches_eq!(expected, &actual); + Ok(()) + } + + #[tokio::test] + async fn test_source_rn_ordered2() -> Result<()> { + let config = SessionConfig::new() + .with_target_partitions(1); + let ctx = SessionContext::with_config(config); + ctx.sql("CREATE EXTERNAL TABLE annotated_data_finite ( + ts INTEGER, + inc_col INTEGER, + desc_col INTEGER, + ) + STORED AS CSV + WITH HEADER ROW + WITH ORDER (ts ASC) + LOCATION 'tests/data/window_1.csv'").await?; + + let sql = "SELECT ts, rn1 FROM (SELECT ts, inc_col, + ROW_NUMBER() OVER(ORDER BY ts DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) as rn1 + FROM annotated_data_finite + ORDER BY ts DESC) + ORDER BY rn1 ASC + LIMIT 5"; + + let msg = format!("Creating logical plan for '{sql}'"); + let dataframe = ctx.sql(sql).await.expect(&msg); + let physical_plan = dataframe.create_physical_plan().await?; + let formatted = 
displayable(physical_plan.as_ref()).indent().to_string(); + let expected = { + vec![ + "GlobalLimitExec: skip=0, fetch=5", + " ProjectionExec: expr=[ts@0 as ts, ROW_NUMBER() ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@1 as rn1]", + " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)) }], mode=[Sorted]", + " SortExec: expr=[ts@0 DESC]", + ] + }; + + let actual: Vec<&str> = formatted.trim().lines().collect(); + let actual_len = actual.len(); + let actual_trim_last = &actual[..actual_len - 1]; + assert_eq!( + expected, actual_trim_last, + "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" + ); + + let actual = collect(physical_plan, ctx.task_ctx()).await?; + let expected = vec![ + "+-----+-----+", + "| ts | rn1 |", + "+-----+-----+", + "| 264 | 1 |", + "| 264 | 2 |", + "| 262 | 3 |", + "| 258 | 4 |", + "| 254 | 5 |", + "+-----+-----+", + ]; + assert_batches_eq!(expected, &actual); + Ok(()) + } +} diff --git a/datafusion/core/src/physical_plan/projection.rs b/datafusion/core/src/physical_plan/projection.rs index f2775079fc4a4..151b02ff4370b 100644 --- a/datafusion/core/src/physical_plan/projection.rs +++ b/datafusion/core/src/physical_plan/projection.rs @@ -223,6 +223,8 @@ impl ExecutionPlan for ProjectionExec { &self.columns_map, &mut new_properties, ); + println!("self schema:{:?}", self.schema); + println!("proj output ordering: {:?}", self.output_ordering); new_properties } diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index e66beef6fc491..8be234ef32f50 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -252,6 +252,7 @@ pub(crate) fn 
window_ordering_equivalence( input: &Arc, window_expr: &[Arc], ) -> OrderingEquivalenceProperties { + println!("window_ordering_equivalence:"); // We need to update the schema, so we can not directly use // `input.ordering_equivalence_properties()`. let mut result = OrderingEquivalenceProperties::new(schema.clone()); diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index f9185837c18fd..9f2390cff5b79 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -324,6 +324,7 @@ pub fn project_ordering_equivalence_properties( columns_map: &HashMap>, output_eq: &mut OrderingEquivalenceProperties, ) { + println!("project_ordering_equivalence_properties is called"); let mut ec_classes = input_eq.classes().to_vec(); for class in ec_classes.iter_mut() { class.update_with_aliases(columns_map); From 83426412db23ddf15398e3e909fe4a4ba984e9ab Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 16:00:06 +0300 Subject: [PATCH 08/23] fix bug --- .../physical_optimizer/sort_enforcement.rs | 132 +----------------- .../core/src/physical_plan/projection.rs | 1 + .../core/src/physical_plan/windows/mod.rs | 1 + datafusion/physical-expr/src/equivalence.rs | 2 +- datafusion/physical-expr/src/utils.rs | 1 - 5 files changed, 7 insertions(+), 130 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 6b34b575e1126..2414d65551d74 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -49,7 +49,9 @@ use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::windows::{ BoundedWindowAggExec, PartitionSearchMode, WindowAggExec, }; -use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan, displayable}; +use 
crate::physical_plan::{ + displayable, with_new_children_if_necessary, Distribution, ExecutionPlan, +}; use arrow::datatypes::SchemaRef; use datafusion_common::tree_node::{Transformed, TreeNode, VisitRecursion}; use datafusion_common::utils::{get_at_indices, longest_consecutive_prefix}; @@ -444,7 +446,7 @@ fn parallelize_sorts( })) } -fn print_plan(plan: &Arc) -> Result<()>{ +fn print_plan(plan: &Arc) -> Result<()> { let formatted = displayable(plan.as_ref()).indent().to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); println!("{:#?}", actual); @@ -2905,129 +2907,3 @@ mod tests { ) } } - - -mod tmp_tests{ - use tempfile::TempDir; - use datafusion_common::Result; - use datafusion_execution::config::SessionConfig; - use crate::assert_batches_eq; - use crate::physical_plan::{collect, displayable}; - use crate::prelude::SessionContext; - - #[tokio::test] - async fn test_source_rn_ordered() -> Result<()> { - let config = SessionConfig::new() - .with_target_partitions(1); - let ctx = SessionContext::with_config(config); - ctx.sql("CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite ( - ts INTEGER, - inc_col INTEGER, - desc_col INTEGER, - ) - STORED AS CSV - WITH HEADER ROW - WITH ORDER (ts ASC) - LOCATION 'tests/data/window_1.csv'").await?; - - let sql = "SELECT ts, rn1 FROM (SELECT ts, inc_col, - ROW_NUMBER() OVER() as rn1 - FROM annotated_data_infinite - ORDER BY ts ASC) - ORDER BY rn1 - LIMIT 5"; - - let msg = format!("Creating logical plan for '{sql}'"); - let dataframe = ctx.sql(sql).await.expect(&msg); - let physical_plan = dataframe.create_physical_plan().await?; - let formatted = displayable(physical_plan.as_ref()).indent().to_string(); - let expected = { - vec![ - "GlobalLimitExec: skip=0, fetch=5", - " ProjectionExec: expr=[ts@0 as ts, ROW_NUMBER() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1]", - " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }], mode=[Sorted]", - ] - }; - - let actual: Vec<&str> = formatted.trim().lines().collect(); - let actual_len = actual.len(); - let actual_trim_last = &actual[..actual_len - 1]; - assert_eq!( - expected, actual_trim_last, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" - ); - let actual = collect(physical_plan, ctx.task_ctx()).await?; - let expected = vec![ - "+----+-----+", - "| ts | rn1 |", - "+----+-----+", - "| 1 | 1 |", - "| 1 | 2 |", - "| 5 | 3 |", - "| 9 | 4 |", - "| 10 | 5 |", - "+----+-----+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) - } - - #[tokio::test] - async fn test_source_rn_ordered2() -> Result<()> { - let config = SessionConfig::new() - .with_target_partitions(1); - let ctx = SessionContext::with_config(config); - ctx.sql("CREATE EXTERNAL TABLE annotated_data_finite ( - ts INTEGER, - inc_col INTEGER, - desc_col INTEGER, - ) - STORED AS CSV - WITH HEADER ROW - WITH ORDER (ts ASC) - LOCATION 'tests/data/window_1.csv'").await?; - - let sql = "SELECT ts, rn1 FROM (SELECT ts, inc_col, - ROW_NUMBER() OVER(ORDER BY ts DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) as rn1 - FROM annotated_data_finite - ORDER BY ts DESC) - ORDER BY rn1 ASC - LIMIT 5"; - - let msg = format!("Creating logical plan for '{sql}'"); - let dataframe = ctx.sql(sql).await.expect(&msg); - let physical_plan = dataframe.create_physical_plan().await?; - let formatted = displayable(physical_plan.as_ref()).indent().to_string(); - let expected = { - vec![ - "GlobalLimitExec: skip=0, fetch=5", - " ProjectionExec: expr=[ts@0 as ts, ROW_NUMBER() ORDER BY [annotated_data.ts DESC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@1 as rn1]", - " BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: \"ROW_NUMBER()\", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: 
false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(1)) }], mode=[Sorted]", - " SortExec: expr=[ts@0 DESC]", - ] - }; - - let actual: Vec<&str> = formatted.trim().lines().collect(); - let actual_len = actual.len(); - let actual_trim_last = &actual[..actual_len - 1]; - assert_eq!( - expected, actual_trim_last, - "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" - ); - - let actual = collect(physical_plan, ctx.task_ctx()).await?; - let expected = vec![ - "+-----+-----+", - "| ts | rn1 |", - "+-----+-----+", - "| 264 | 1 |", - "| 264 | 2 |", - "| 262 | 3 |", - "| 258 | 4 |", - "| 254 | 5 |", - "+-----+-----+", - ]; - assert_batches_eq!(expected, &actual); - Ok(()) - } -} diff --git a/datafusion/core/src/physical_plan/projection.rs b/datafusion/core/src/physical_plan/projection.rs index 151b02ff4370b..b0af8558aecbe 100644 --- a/datafusion/core/src/physical_plan/projection.rs +++ b/datafusion/core/src/physical_plan/projection.rs @@ -223,6 +223,7 @@ impl ExecutionPlan for ProjectionExec { &self.columns_map, &mut new_properties, ); + println!("new properties: {:?}", new_properties); println!("self schema:{:?}", self.schema); println!("proj output ordering: {:?}", self.output_ordering); new_properties diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index 8be234ef32f50..88ec0c6235ad1 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -308,6 +308,7 @@ pub(crate) fn window_ordering_equivalence( } } } + println!("window result:{:?}", result); result } #[cfg(test)] diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 9f2390cff5b79..4d8adf6de2d7c 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -249,7 +249,7 @@ impl OrderingEquivalentClass { for col 
in columns { normalized[idx] = OrderedColumn { col: col.clone(), - options: self.head[idx].options, + options: elem.options, }; self.insert(normalized.clone()); } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 1fcc4a19acfdc..94ae822672824 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -174,7 +174,6 @@ fn normalize_sort_expr_with_equivalence_properties( sort_expr } - fn normalize_sort_requirement_with_equivalence_properties( mut sort_requirement: PhysicalSortRequirement, eq_properties: &[EquivalentClass], From 6e6231c4f3197b7f2423620f37d3f76957b4e905 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 16:14:06 +0300 Subject: [PATCH 09/23] simplifications --- datafusion/physical-expr/src/equivalence.rs | 16 ++--- datafusion/physical-expr/src/utils.rs | 67 ++++++--------------- 2 files changed, 27 insertions(+), 56 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 4d8adf6de2d7c..ebf8940127846 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -200,20 +200,20 @@ impl OrderedColumn { } } -impl Into for OrderedColumn { - fn into(self) -> PhysicalSortExpr { +impl From for PhysicalSortExpr { + fn from(value: OrderedColumn) -> Self { PhysicalSortExpr { - expr: Arc::new(self.col) as _, - options: self.options, + expr: Arc::new(value.col) as _, + options: value.options, } } } -impl Into for OrderedColumn { - fn into(self) -> PhysicalSortRequirement { +impl From for PhysicalSortRequirement { + fn from(value: OrderedColumn) -> Self { PhysicalSortRequirement { - expr: Arc::new(self.col) as _, - options: Some(self.options), + expr: Arc::new(value.col) as _, + options: Some(value.options), } } } diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 94ae822672824..5aca4f677b6f2 100644 --- 
a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -16,14 +16,13 @@ // under the License. use crate::equivalence::{ - EquivalenceProperties, EquivalentClass, OrderedColumn, OrderingEquivalenceProperties, + EquivalenceProperties, EquivalentClass, OrderingEquivalenceProperties, OrderingEquivalentClass, }; use crate::expressions::{BinaryExpr, Column, UnKnownColumn}; use crate::{PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement}; use arrow::datatypes::SchemaRef; -use arrow_schema::SortOptions; use datafusion_common::tree_node::{ Transformed, TreeNode, TreeNodeRewriter, VisitRecursion, }; @@ -366,14 +365,10 @@ pub fn ordering_satisfy_concrete< let ordering_eq_classes = oeq_properties.classes(); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); - let mut required_normalized = + let required_normalized = normalize_sort_expr2(required, eq_classes, ordering_eq_classes); - // TODO: Add collapse procedure - let mut provided_normalized = + let provided_normalized = normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); - // TODO: Add collapse procedure - println!("required_normalized: {:?}", required_normalized); - println!("provided_normalized: {:?}", provided_normalized); if required_normalized.len() > provided_normalized.len() { return false; } @@ -421,9 +416,9 @@ pub fn ordering_satisfy_requirement_concrete< let ordering_eq_classes = oeq_properties.classes(); let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); - let mut required_normalized = + let required_normalized = normalize_sort_requirements2(required, eq_classes, ordering_eq_classes); - let mut provided_normalized = + let provided_normalized = normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; @@ -739,7 +734,7 @@ pub fn reassign_predicate_columns( mod tests { use super::*; use crate::expressions::{binary, cast, col, 
in_list, lit, Column, Literal}; - use crate::PhysicalSortExpr; + use crate::{OrderedColumn, PhysicalSortExpr}; use arrow::compute::SortOptions; use datafusion_common::{Result, ScalarValue}; use std::fmt::{Display, Formatter}; @@ -1312,30 +1307,26 @@ mod tests { #[test] fn test_normalize_expr_with_equivalence() -> Result<()> { let col_a = &Column::new("a", 0); - let _col_b = &Column::new("b", 1); + let col_b = &Column::new("b", 1); let col_c = &Column::new("c", 2); - let col_d = &Column::new("d", 3); - let col_e = &Column::new("e", 4); - let option1 = SortOptions { - descending: false, - nulls_first: false, - }; - let option2 = SortOptions { - descending: true, - nulls_first: true, - }; - // Assume schema satisfies ordering a ASC NULLS LAST - // and d ASC NULLS LAST and e DESC NULLS FIRST + let _col_d = &Column::new("d", 3); + let _col_e = &Column::new("e", 4); // Assume that column a and c are aliases. - let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; + let (_test_schema, eq_properties, _ordering_eq_properties) = + create_test_params()?; let col_a_expr = Arc::new(col_a.clone()) as Arc; + let col_b_expr = Arc::new(col_b.clone()) as Arc; let col_c_expr = Arc::new(col_c.clone()) as Arc; - let col_d_expr = Arc::new(col_d.clone()) as Arc; - let col_e_expr = Arc::new(col_e.clone()) as Arc; // Test cases for equivalence normalization, // First entry in the tuple is argument, second entry is expected result after normalization. 
- let expressions = vec![(&col_a_expr, &col_a_expr), (&col_c_expr, &col_a_expr)]; + let expressions = vec![ + // Normalized version of the column a and c should go to a (since a is head) + (&col_a_expr, &col_a_expr), + (&col_c_expr, &col_a_expr), + // Cannot normalize column b + (&col_b_expr, &col_b_expr), + ]; for (expr, expected_eq) in expressions { assert!( expected_eq.eq(&normalize_expr_with_equivalence_properties( @@ -1346,26 +1337,6 @@ mod tests { ); } - // // Test cases for ordering equivalence normalization - // // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. - // let expressions = vec![ - // (&col_d_expr, option1, option1, &col_a_expr), - // (&col_e_expr, option2, option1, &col_a_expr), - // // Cannot normalize, hence should return itself. - // (&col_e_expr, option1, option1, &col_e_expr), - // ]; - // for (expr, sort_options, expected_options, expected_ordering_eq) in expressions { - // let (normalized_expr, options) = - // normalize_expr_with_ordering_equivalence_properties( - // expr.clone(), - // sort_options, - // ordering_eq_properties.classes(), - // ); - // assert!( - // normalized_expr.eq(expected_ordering_eq) && (expected_options == options), - // "error in test: expr: {expr:?}, sort_options: {sort_options:?}" - // ); - // } Ok(()) } From 7d2ecdd35f8920348ad3306ac624a34e8d8f0785 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 16:21:55 +0300 Subject: [PATCH 10/23] Resolve linter errors --- datafusion/physical-expr/src/utils.rs | 49 +++++---------------------- 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 5aca4f677b6f2..f74d891b4845b 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -468,9 +468,9 @@ fn requirements_compatible_concrete< let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); 
- let mut required_normalized = + let required_normalized = normalize_sort_requirements2(required, eq_classes, ordering_eq_classes); - let mut provided_normalized = + let provided_normalized = normalize_sort_requirements2(provided, eq_classes, ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; @@ -1346,19 +1346,14 @@ mod tests { let _col_b = &Column::new("b", 1); let col_c = &Column::new("c", 2); let col_d = &Column::new("d", 3); - let col_e = &Column::new("e", 4); + let _col_e = &Column::new("e", 4); let option1 = SortOptions { descending: false, nulls_first: false, }; - let option2 = SortOptions { - descending: true, - nulls_first: true, - }; - // Assume schema satisfies ordering a ASC NULLS LAST - // and d ASC NULLS LAST and e DESC NULLS FIRST // Assume that column a and c are aliases. - let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; + let (_test_schema, eq_properties, _ordering_eq_properties) = + create_test_params()?; // Test cases for equivalence normalization // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. @@ -1388,31 +1383,6 @@ mod tests { ); } - // // Test cases for ordering equivalence normalization - // // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. 
- // let expressions = vec![ - // (&col_d, option1, &col_a, option1), - // (&col_e, option2, &col_a, option1), - // ]; - // for (expr, sort_options, expected_col, expected_options) in - // expressions.into_iter() - // { - // let expected = PhysicalSortExpr { - // expr: Arc::new((*expected_col).clone()) as _, - // options: expected_options, - // }; - // let arg = PhysicalSortExpr { - // expr: Arc::new((*expr).clone()) as _, - // options: sort_options, - // }; - // assert!( - // expected.eq(&normalize_sort_expr_with_ordering_equivalence_properties( - // arg.clone(), - // ordering_eq_properties.classes() - // )), - // "error in test: expr: {expr:?}, sort_options: {sort_options:?}" - // ); - // } Ok(()) } @@ -1422,19 +1392,16 @@ mod tests { let _col_b = &Column::new("b", 1); let col_c = &Column::new("c", 2); let col_d = &Column::new("d", 3); - let col_e = &Column::new("e", 4); + let _col_e = &Column::new("e", 4); let option1 = SortOptions { descending: false, nulls_first: false, }; - let option2 = SortOptions { - descending: true, - nulls_first: true, - }; // Assume schema satisfies ordering a ASC NULLS LAST // and d ASC NULLS LAST and e DESC NULLS FIRST // Assume that column a and c are aliases. - let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; + let (_test_schema, eq_properties, _ordering_eq_properties) = + create_test_params()?; // Test cases for equivalence normalization // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. 
From c65d6001cf229d15b46e5887b4d9bbeade70d536 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 16:26:47 +0300 Subject: [PATCH 11/23] remove unnecessary codes --- .../src/physical_optimizer/sort_enforcement.rs | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/sort_enforcement.rs b/datafusion/core/src/physical_optimizer/sort_enforcement.rs index 2414d65551d74..f71c79e9fc829 100644 --- a/datafusion/core/src/physical_optimizer/sort_enforcement.rs +++ b/datafusion/core/src/physical_optimizer/sort_enforcement.rs @@ -49,9 +49,7 @@ use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::windows::{ BoundedWindowAggExec, PartitionSearchMode, WindowAggExec, }; -use crate::physical_plan::{ - displayable, with_new_children_if_necessary, Distribution, ExecutionPlan, -}; +use crate::physical_plan::{with_new_children_if_necessary, Distribution, ExecutionPlan}; use arrow::datatypes::SchemaRef; use datafusion_common::tree_node::{Transformed, TreeNode, VisitRecursion}; use datafusion_common::utils::{get_at_indices, longest_consecutive_prefix}; @@ -446,13 +444,6 @@ fn parallelize_sorts( })) } -fn print_plan(plan: &Arc) -> Result<()> { - let formatted = displayable(plan.as_ref()).indent().to_string(); - let actual: Vec<&str> = formatted.trim().lines().collect(); - println!("{:#?}", actual); - Ok(()) -} - /// This function enforces sorting requirements and makes optimizations without /// violating these requirements whenever possible. 
fn ensure_sorting( @@ -463,10 +454,6 @@ fn ensure_sorting( return Ok(Transformed::No(requirements)); } let plan = requirements.plan; - println!("--------------------------------------"); - print_plan(&plan)?; - println!("plan.output ordering:{:?}", plan.output_ordering()); - println!("--------------------------------------"); let mut children = plan.children(); let mut sort_onwards = requirements.sort_onwards; if let Some(result) = analyze_immediate_sort_removal(&plan, &sort_onwards) { From 45fb9c57e54f972b34fb006130aad1e0863e7da0 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 16:28:30 +0300 Subject: [PATCH 12/23] simplifications --- datafusion/core/src/physical_plan/projection.rs | 3 --- datafusion/core/src/physical_plan/windows/mod.rs | 2 -- 2 files changed, 5 deletions(-) diff --git a/datafusion/core/src/physical_plan/projection.rs b/datafusion/core/src/physical_plan/projection.rs index b0af8558aecbe..f2775079fc4a4 100644 --- a/datafusion/core/src/physical_plan/projection.rs +++ b/datafusion/core/src/physical_plan/projection.rs @@ -223,9 +223,6 @@ impl ExecutionPlan for ProjectionExec { &self.columns_map, &mut new_properties, ); - println!("new properties: {:?}", new_properties); - println!("self schema:{:?}", self.schema); - println!("proj output ordering: {:?}", self.output_ordering); new_properties } diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index 88ec0c6235ad1..e66beef6fc491 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -252,7 +252,6 @@ pub(crate) fn window_ordering_equivalence( input: &Arc, window_expr: &[Arc], ) -> OrderingEquivalenceProperties { - println!("window_ordering_equivalence:"); // We need to update the schema, so we can not directly use // `input.ordering_equivalence_properties()`. 
let mut result = OrderingEquivalenceProperties::new(schema.clone()); @@ -308,7 +307,6 @@ pub(crate) fn window_ordering_equivalence( } } } - println!("window result:{:?}", result); result } #[cfg(test)] From 91fc43c1ea6c0455fd9f3f85f21195cab8d13434 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 16:47:24 +0300 Subject: [PATCH 13/23] simplifications --- datafusion/physical-expr/src/equivalence.rs | 22 ---- datafusion/physical-expr/src/utils.rs | 138 +++----------------- 2 files changed, 16 insertions(+), 144 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index ebf8940127846..9a45475752b66 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -291,27 +291,6 @@ pub fn project_equivalence_properties( output_eq.extend(ec_classes); } -// /// This function applies the given projection to the given ordering -// /// equivalence properties to compute the resulting (projected) ordering -// /// equivalence properties; e.g. -// /// 1) Adding an alias, which can introduce additional ordering equivalence -// /// properties, as in Projection(a, a as a1, a as a2) extends global ordering -// /// of a to a1 and a2. -// /// 2) Truncate the [`OrderingEquivalentClass`]es that are not in the output schema. -// pub fn project_ordering_equivalence_properties( -// input_eq: OrderingEquivalenceProperties, -// columns_map: &HashMap>, -// output_eq: &mut OrderingEquivalenceProperties, -// ) { -// let mut ec_classes = input_eq.classes().to_vec(); -// for class in ec_classes.iter_mut() { -// class.update_with_aliases(columns_map); -// } -// -// prune_columns_to_remove(output_eq, &mut ec_classes); -// output_eq.extend(ec_classes); -// } - /// This function applies the given projection to the given ordering /// equivalence properties to compute the resulting (projected) ordering /// equivalence properties; e.g. 
@@ -324,7 +303,6 @@ pub fn project_ordering_equivalence_properties( columns_map: &HashMap>, output_eq: &mut OrderingEquivalenceProperties, ) { - println!("project_ordering_equivalence_properties is called"); let mut ec_classes = input_eq.classes().to_vec(); for class in ec_classes.iter_mut() { class.update_with_aliases(columns_map); diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index f74d891b4845b..302ed72cd536f 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -209,59 +209,22 @@ fn collapse_vec(in_data: Vec) -> Vec { out_data } -pub fn normalize_sort_expr2( +pub fn normalize_sort_exprs( sort_exprs: &[PhysicalSortExpr], eq_properties: &[EquivalentClass], ordering_eq_properties: &[OrderingEquivalentClass], ) -> Vec { - let mut normalized_exprs = sort_exprs - .iter() - .map(|sort_expr| { - normalize_sort_expr_with_equivalence_properties( - sort_expr.clone(), - eq_properties, - ) - }) - .collect::>(); - for ordering_eq_class in ordering_eq_properties { - for elem in ordering_eq_class.others() { - let elem: Vec = elem - .clone() - .into_iter() - .map(|elem| elem.into()) - .collect::>(); - println!("normalized_exprs: {:?}", normalized_exprs); - println!("elem: {:?}", elem); - let ranges = get_ranges_inside(&normalized_exprs, &elem); - let mut offset: i64 = 0; - for Range { start, end } in ranges { - println!("start:{:?}, end:{:?}", start, end); - let head: Vec = ordering_eq_class - .head() - .clone() - .into_iter() - .map(|elem| elem.into()) - .collect::>(); - println!("head:{:?}", head); - let updated_start: i64 = start as i64 + offset; - let updated_end: i64 = end as i64 + offset; - println!( - "updated_start: {:?}, updated_end:{:?}", - updated_start, updated_end - ); - let range = end - start; - offset += head.len() as i64 - range as i64; - normalized_exprs - .splice(updated_start as usize..updated_end as usize, head); - println!("normalized_exprs bef return:{:?}", 
normalized_exprs); - // break; - } - } - } + let sort_requirements = PhysicalSortRequirement::from_sort_exprs(sort_exprs.iter()); + let normalized_exprs = normalize_sort_requirements( + &sort_requirements, + eq_properties, + ordering_eq_properties, + ); + let normalized_exprs = PhysicalSortRequirement::to_sort_exprs(normalized_exprs); collapse_vec(normalized_exprs) } -pub fn normalize_sort_requirements2( +pub fn normalize_sort_requirements( sort_exprs: &[PhysicalSortRequirement], eq_properties: &[EquivalentClass], ordering_eq_properties: &[OrderingEquivalentClass], @@ -282,52 +245,27 @@ pub fn normalize_sort_requirements2( .into_iter() .map(|elem| elem.into()) .collect::>(); - println!("normalized_exprs: {:?}", normalized_exprs); - println!("elem: {:?}", elem); let ranges = get_ranges_inside(&normalized_exprs, &elem); let mut offset: i64 = 0; for Range { start, end } in ranges { - println!("start:{:?}, end:{:?}", start, end); let head: Vec = ordering_eq_class .head() .clone() .into_iter() .map(|elem| elem.into()) .collect::>(); - println!("head:{:?}", head); let updated_start: i64 = start as i64 + offset; let updated_end: i64 = end as i64 + offset; - println!( - "updated_start: {:?}, updated_end:{:?}", - updated_start, updated_end - ); let range = end - start; offset += head.len() as i64 - range as i64; normalized_exprs .splice(updated_start as usize..updated_end as usize, head); - println!("normalized_exprs bef return:{:?}", normalized_exprs); - // break; } } } collapse_vec(normalized_exprs) } -// pub fn normalize_sort_requirement( -// sort_requirement: PhysicalSortRequirement, -// eq_properties: &[EquivalentClass], -// ordering_eq_properties: &[OrderingEquivalentClass], -// ) -> PhysicalSortRequirement { -// let normalized = normalize_sort_requirement_with_equivalence_properties( -// sort_requirement, -// eq_properties, -// ); -// normalize_sort_requirement_with_ordering_equivalence_properties( -// normalized, -// ordering_eq_properties, -// ) -// } - /// Checks 
whether given ordering requirements are satisfied by provided [PhysicalSortExpr]s. pub fn ordering_satisfy< F: FnOnce() -> EquivalenceProperties, @@ -366,9 +304,9 @@ pub fn ordering_satisfy_concrete< let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_expr2(required, eq_classes, ordering_eq_classes); + normalize_sort_exprs(required, eq_classes, ordering_eq_classes); let provided_normalized = - normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); + normalize_sort_exprs(provided, eq_classes, ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -417,9 +355,9 @@ pub fn ordering_satisfy_requirement_concrete< let eq_properties = equal_properties(); let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_requirements2(required, eq_classes, ordering_eq_classes); + normalize_sort_requirements(required, eq_classes, ordering_eq_classes); let provided_normalized = - normalize_sort_expr2(provided, eq_classes, ordering_eq_classes); + normalize_sort_exprs(provided, eq_classes, ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -469,9 +407,9 @@ fn requirements_compatible_concrete< let eq_classes = eq_properties.classes(); let required_normalized = - normalize_sort_requirements2(required, eq_classes, ordering_eq_classes); + normalize_sort_requirements(required, eq_classes, ordering_eq_classes); let provided_normalized = - normalize_sort_requirements2(provided, eq_classes, ordering_eq_classes); + normalize_sort_requirements(provided, eq_classes, ordering_eq_classes); if required_normalized.len() > provided_normalized.len() { return false; } @@ -799,43 +737,6 @@ mod tests { SchemaRef, EquivalenceProperties, OrderingEquivalenceProperties, - )> { - // Assume schema satisfies ordering a ASC NULLS LAST - // and d ASC NULLS LAST and e DESC NULLS FIRST - // Assume that column a and c 
are aliases. - let col_a = &Column::new("a", 0); - let _col_b = &Column::new("b", 1); - let col_c = &Column::new("c", 2); - let col_d = &Column::new("d", 3); - let col_e = &Column::new("e", 4); - let option1 = SortOptions { - descending: false, - nulls_first: false, - }; - let option2 = SortOptions { - descending: true, - nulls_first: true, - }; - let test_schema = create_test_schema()?; - let mut eq_properties = EquivalenceProperties::new(test_schema.clone()); - eq_properties.add_equal_conditions((col_a, col_c)); - let mut ordering_eq_properties = - OrderingEquivalenceProperties::new(test_schema.clone()); - ordering_eq_properties.add_equal_conditions(( - &vec![OrderedColumn::new(col_a.clone(), option1)], - &vec![OrderedColumn::new(col_d.clone(), option1)], - )); - ordering_eq_properties.add_equal_conditions(( - &vec![OrderedColumn::new(col_a.clone(), option1)], - &vec![OrderedColumn::new(col_e.clone(), option2)], - )); - Ok((test_schema, eq_properties, ordering_eq_properties)) - } - - fn create_test_params2() -> Result<( - SchemaRef, - EquivalenceProperties, - OrderingEquivalenceProperties, )> { // Assume schema satisfies ordering a ASC NULLS LAST // and d ASC NULLS LAST, b ASC NULLS LAST and e DESC NULLS FIRST, b ASC NULLS LAST @@ -1185,8 +1086,7 @@ mod tests { }, ]; let provided = Some(&provided[..]); - let (_test_schema, eq_properties, ordering_eq_properties) = - create_test_params2()?; + let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; // First element in the tuple stores vector of requirement, second element is the expected return value for ordering_satisfy function let requirements = vec![ // `a ASC NULLS LAST`, expects `ordering_satisfy` to be `true`, since existing ordering `a ASC NULLS LAST, b ASC NULLS LAST` satisfies it @@ -1237,10 +1137,6 @@ mod tests { ], false, ), - // (vec![(col_d, option1)], true), - // (vec![(col_d, option2)], false), - // (vec![(col_e, option2)], true), - // (vec![(col_e, option1)], false), ]; 
for (cols, expected) in requirements { let err_msg = format!("Error in test case:{cols:?}"); @@ -1397,8 +1293,6 @@ mod tests { descending: false, nulls_first: false, }; - // Assume schema satisfies ordering a ASC NULLS LAST - // and d ASC NULLS LAST and e DESC NULLS FIRST // Assume that column a and c are aliases. let (_test_schema, eq_properties, _ordering_eq_properties) = create_test_params()?; From 4468a8e0e5d24418c72f3ac08c52bf70117e30a9 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 16:53:27 +0300 Subject: [PATCH 14/23] Remove unnecessary codes --- datafusion/physical-expr/src/utils.rs | 57 +-------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 302ed72cd536f..85aad74e531bf 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -164,15 +164,6 @@ pub fn normalize_expr_with_equivalence_properties( .unwrap_or(expr) } -fn normalize_sort_expr_with_equivalence_properties( - mut sort_expr: PhysicalSortExpr, - eq_properties: &[EquivalentClass], -) -> PhysicalSortExpr { - sort_expr.expr = - normalize_expr_with_equivalence_properties(sort_expr.expr, eq_properties); - sort_expr -} - fn normalize_sort_requirement_with_equivalence_properties( mut sort_requirement: PhysicalSortRequirement, eq_properties: &[EquivalentClass], @@ -182,6 +173,7 @@ fn normalize_sort_requirement_with_equivalence_properties( sort_requirement } +// Searches `section` inside the `to_search`. Returns each range `section` found inside the `to_search`. 
fn get_ranges_inside(to_search: &[T], section: &[T]) -> Vec> { let n_section = section.len(); let n_end = if to_search.len() >= n_section { @@ -199,6 +191,7 @@ fn get_ranges_inside(to_search: &[T], section: &[T]) -> Vec(in_data: Vec) -> Vec { let mut out_data = vec![]; for elem in in_data { @@ -1236,52 +1229,6 @@ mod tests { Ok(()) } - #[test] - fn test_normalize_sort_expr_with_equivalence() -> Result<()> { - let col_a = &Column::new("a", 0); - let _col_b = &Column::new("b", 1); - let col_c = &Column::new("c", 2); - let col_d = &Column::new("d", 3); - let _col_e = &Column::new("e", 4); - let option1 = SortOptions { - descending: false, - nulls_first: false, - }; - // Assume that column a and c are aliases. - let (_test_schema, eq_properties, _ordering_eq_properties) = - create_test_params()?; - - // Test cases for equivalence normalization - // First entry in the tuple is PhysicalExpr, second entry is its ordering, third entry is result after normalization. - let expressions = vec![ - (&col_a, option1, &col_a, option1), - (&col_c, option1, &col_a, option1), - // Cannot normalize column d, since it is not in equivalence properties. 
- (&col_d, option1, &col_d, option1), - ]; - for (expr, sort_options, expected_col, expected_options) in - expressions.into_iter() - { - let expected = PhysicalSortExpr { - expr: Arc::new((*expected_col).clone()) as _, - options: expected_options, - }; - let arg = PhysicalSortExpr { - expr: Arc::new((*expr).clone()) as _, - options: sort_options, - }; - assert!( - expected.eq(&normalize_sort_expr_with_equivalence_properties( - arg.clone(), - eq_properties.classes() - )), - "error in test: expr: {expr:?}, sort_options: {sort_options:?}" - ); - } - - Ok(()) - } - #[test] fn test_normalize_sort_requirement_with_equivalence() -> Result<()> { let col_a = &Column::new("a", 0); From 9b4bace9b705391c138963704cef65768a7df306 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 17:18:08 +0300 Subject: [PATCH 15/23] Add pruning to ordering_equivalence projection --- datafusion/physical-expr/src/equivalence.rs | 70 ++++++++++----------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 9a45475752b66..5d634787da1af 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -218,22 +218,6 @@ impl From for PhysicalSortRequirement { } } -trait ColumnAccessor { - fn column(&self) -> &Column; -} - -impl ColumnAccessor for Column { - fn column(&self) -> &Column { - self - } -} - -impl ColumnAccessor for OrderedColumn { - fn column(&self) -> &Column { - &self.col - } -} - pub type OrderingEquivalentClass = EquivalentClass>; impl OrderingEquivalentClass { @@ -270,10 +254,10 @@ pub fn project_equivalence_properties( alias_map: &HashMap>, output_eq: &mut EquivalenceProperties, ) { - let mut ec_classes = input_eq.classes().to_vec(); + let mut eq_classes = input_eq.classes().to_vec(); for (column, columns) in alias_map { let mut find_match = false; - for class in ec_classes.iter_mut() { + for class in 
eq_classes.iter_mut() { if class.contains(column) { for col in columns { class.insert(col.clone()); @@ -283,12 +267,29 @@ pub fn project_equivalence_properties( } } if !find_match { - ec_classes.push(EquivalentClass::new(column.clone(), columns.clone())); + eq_classes.push(EquivalentClass::new(column.clone(), columns.clone())); + } + } + + // Prune columns that no longer is in the schema from from the EquivalenceProperties. + let schema = output_eq.schema(); + let fields = schema.fields(); + for class in eq_classes.iter_mut() { + let columns_to_remove = class + .iter() + .filter(|column| { + let idx = column.index(); + idx >= fields.len() || fields[idx].name() != column.name() + }) + .cloned() + .collect::>(); + for column in columns_to_remove { + class.remove(&column); } } + eq_classes.retain(|props| props.len() > 1); - prune_columns_to_remove(output_eq, &mut ec_classes); - output_eq.extend(ec_classes); + output_eq.extend(eq_classes); } /// This function applies the given projection to the given ordering @@ -303,29 +304,22 @@ pub fn project_ordering_equivalence_properties( columns_map: &HashMap>, output_eq: &mut OrderingEquivalenceProperties, ) { - let mut ec_classes = input_eq.classes().to_vec(); - for class in ec_classes.iter_mut() { + let mut eq_classes = input_eq.classes().to_vec(); + for class in eq_classes.iter_mut() { class.update_with_aliases(columns_map); } - // prune_columns_to_remove(output_eq, &mut ec_classes); - // TODO: Add pruning - output_eq.extend(ec_classes); -} - -fn prune_columns_to_remove( - eq_properties: &EquivalenceProperties, - eq_classes: &mut Vec>, -) { - let schema = eq_properties.schema(); + // Prune columns that no longer is in the schema from from the OrderingEquivalenceProperties. 
+ let schema = output_eq.schema(); let fields = schema.fields(); for class in eq_classes.iter_mut() { let columns_to_remove = class .iter() - .filter(|elem| { - let column = elem.column(); - let idx = column.index(); - idx >= fields.len() || fields[idx].name() != column.name() + .filter(|columns| { + columns.iter().any(|column| { + let idx = column.col.index(); + idx >= fields.len() || fields[idx].name() != column.col.name() + }) }) .cloned() .collect::>(); @@ -334,6 +328,8 @@ fn prune_columns_to_remove( } } eq_classes.retain(|props| props.len() > 1); + + output_eq.extend(eq_classes); } #[cfg(test)] From 7a4f935f68c804bbb2c33730023abddc8e74b7d1 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 17:52:11 +0300 Subject: [PATCH 16/23] Remove unnecessary clones --- datafusion/physical-expr/src/equivalence.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 5d634787da1af..12506148b49bf 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -223,23 +223,25 @@ pub type OrderingEquivalentClass = EquivalentClass>; impl OrderingEquivalentClass { fn update_with_aliases(&mut self, columns_map: &HashMap>) { for (column, columns) in columns_map { - for ordering in vec![self.head.clone()] - .iter() - .chain(self.others.clone().iter()) - { + let mut to_insert = vec![]; + for ordering in vec![&self.head].into_iter().chain(self.others.iter()) { for (idx, elem) in ordering.iter().enumerate() { if elem.col.eq(column) { - let mut normalized = self.head.clone(); for col in columns { + let mut normalized = self.head.clone(); + // Change the corresponding entry in the head, with the alias column normalized[idx] = OrderedColumn { col: col.clone(), options: elem.options, }; - self.insert(normalized.clone()); + to_insert.push(normalized); } } } } + for elems in to_insert { + self.insert(elems); + 
} } } } From 89e7754d4bcdd6a2a264b55aee4a84636f2a0ce8 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 17 May 2023 18:27:46 +0300 Subject: [PATCH 17/23] Convert get range to calculate compatible ranges --- datafusion/physical-expr/src/utils.rs | 162 +++++++++++++++++++++----- 1 file changed, 132 insertions(+), 30 deletions(-) diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 85aad74e531bf..800d3e85320cc 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -173,18 +173,25 @@ fn normalize_sort_requirement_with_equivalence_properties( sort_requirement } -// Searches `section` inside the `to_search`. Returns each range `section` found inside the `to_search`. -fn get_ranges_inside(to_search: &[T], section: &[T]) -> Vec> { +// Searches `section` inside the `searched`. Returns each range where `section` is compatible with the corresponding slice in the `searched`. +fn get_compatible_ranges( + searched: &[PhysicalSortRequirement], + section: &[PhysicalSortRequirement], +) -> Vec> { let n_section = section.len(); - let n_end = if to_search.len() >= n_section { - to_search.len() - n_section + 1 + let n_end = if searched.len() >= n_section { + searched.len() - n_section + 1 } else { 0 }; let mut res = vec![]; for idx in 0..n_end { let end = idx + n_section; - if to_search[idx..end].eq(section) { + let matches = searched[idx..end] + .iter() + .zip(section) + .all(|(req, given)| given.compatible(req)); + if matches { res.push(Range { start: idx, end }); } } @@ -218,15 +225,15 @@ pub fn normalize_sort_exprs( } pub fn normalize_sort_requirements( - sort_exprs: &[PhysicalSortRequirement], + sort_reqs: &[PhysicalSortRequirement], eq_properties: &[EquivalentClass], ordering_eq_properties: &[OrderingEquivalentClass], ) -> Vec { - let mut normalized_exprs = sort_exprs + let mut normalized_exprs = sort_reqs .iter() - .map(|sort_expr| { + .map(|sort_req| { 
normalize_sort_requirement_with_equivalence_properties( - sort_expr.clone(), + sort_req.clone(), eq_properties, ) }) @@ -238,7 +245,7 @@ pub fn normalize_sort_requirements( .into_iter() .map(|elem| elem.into()) .collect::>(); - let ranges = get_ranges_inside(&normalized_exprs, &elem); + let ranges = get_compatible_ranges(&normalized_exprs, &elem); let mut offset: i64 = 0; for Range { start, end } in ranges { let head: Vec = ordering_eq_class @@ -247,12 +254,21 @@ pub fn normalize_sort_requirements( .into_iter() .map(|elem| elem.into()) .collect::>(); - let updated_start: i64 = start as i64 + offset; - let updated_end: i64 = end as i64 + offset; + let updated_start = (start as i64 + offset) as usize; + let updated_end = (end as i64 + offset) as usize; let range = end - start; offset += head.len() as i64 - range as i64; - normalized_exprs - .splice(updated_start as usize..updated_end as usize, head); + let all_none = normalized_exprs[updated_start..updated_end] + .iter() + .all(|elem| elem.options.is_none()); + let head = if all_none { + head.into_iter() + .map(|elem| PhysicalSortRequirement::new(elem.expr, None)) + .collect::>() + } else { + head + }; + normalized_exprs.splice(updated_start..updated_end, head); } } } @@ -1156,6 +1172,71 @@ mod tests { Ok(()) } + fn convert_to_requirement( + in_data: &[(&Column, Option)], + ) -> Vec { + in_data + .iter() + .map(|(col, options)| { + PhysicalSortRequirement::new(Arc::new((*col).clone()) as _, *options) + }) + .collect::>() + } + + #[test] + fn test_normalize_sort_reqs() -> Result<()> { + let col_a = &Column::new("a", 0); + let col_b = &Column::new("b", 1); + let col_c = &Column::new("c", 2); + let col_d = &Column::new("d", 3); + let col_e = &Column::new("e", 4); + let option1 = SortOptions { + descending: false, + nulls_first: false, + }; + let option2 = SortOptions { + descending: true, + nulls_first: true, + }; + // First element in the tuple stores vector of requirement, second element is the expected return value 
for ordering_satisfy function + let requirements = vec![ + (vec![(col_a, Some(option1))], vec![(col_a, Some(option1))]), + (vec![(col_a, None)], vec![(col_a, None)]), + // Test whether equivalence works as expected + (vec![(col_c, Some(option1))], vec![(col_a, Some(option1))]), + (vec![(col_c, None)], vec![(col_a, None)]), + // Test whether ordering equivalence works as expected + ( + vec![(col_d, Some(option1)), (col_b, Some(option1))], + vec![(col_a, Some(option1))], + ), + (vec![(col_d, None), (col_b, None)], vec![(col_a, None)]), + ( + vec![(col_e, Some(option2)), (col_b, Some(option1))], + vec![(col_a, Some(option1))], + ), + // We should be able to normalize in compatible requirements also (not exactly equal) + ( + vec![(col_e, Some(option2)), (col_b, None)], + vec![(col_a, Some(option1))], + ), + (vec![(col_e, None), (col_b, None)], vec![(col_a, None)]), + ]; + let (_test_schema, eq_properties, ordering_eq_properties) = create_test_params()?; + let eq_classes = eq_properties.classes(); + let ordering_eq_classes = ordering_eq_properties.classes(); + for (reqs, expected_normalized) in requirements.into_iter() { + let req = convert_to_requirement(&reqs); + let expected_normalized = convert_to_requirement(&expected_normalized); + + assert_eq!( + normalize_sort_requirements(&req, eq_classes, ordering_eq_classes), + expected_normalized + ); + } + Ok(()) + } + #[test] fn test_reassign_predicate_columns_in_list() { let int_field = Field::new("should_not_matter", DataType::Int64, true); @@ -1362,22 +1443,43 @@ mod tests { } #[test] - fn test_get_range_inside() -> Result<()> { - let empty_vec: Vec> = Vec::new(); - assert_eq!( - get_ranges_inside(&[1, 2, 3], &[1, 2]), - vec![Range { start: 0, end: 2 }] - ); - assert_eq!( - get_ranges_inside(&[1, 2, 3], &[2, 3]), - vec![Range { start: 1, end: 3 }] - ); - assert_eq!(get_ranges_inside(&[1, 2, 3], &[1, 3]), empty_vec); - assert_eq!( - get_ranges_inside(&[1, 2, 3], &[1, 2, 3]), - vec![Range { start: 0, end: 3 }] - ); - 
assert_eq!(get_ranges_inside(&[1, 2, 3], &[3, 2]), empty_vec); + fn test_get_compatible_ranges() -> Result<()> { + let col_a = &Column::new("a", 0); + let col_b = &Column::new("b", 1); + let option1 = SortOptions { + descending: false, + nulls_first: false, + }; + let test_data = vec![ + ( + vec![(col_a, Some(option1)), (col_b, Some(option1))], + vec![(col_a, Some(option1))], + vec![(0, 1)], + ), + ( + vec![(col_a, None), (col_b, Some(option1))], + vec![(col_a, Some(option1))], + vec![(0, 1)], + ), + ( + vec![ + (col_a, None), + (col_b, Some(option1)), + (col_a, Some(option1)), + ], + vec![(col_a, Some(option1))], + vec![(0, 1), (2, 3)], + ), + ]; + for (searched, to_search, expected) in test_data { + let searched = convert_to_requirement(&searched); + let to_search = convert_to_requirement(&to_search); + let expected = expected + .into_iter() + .map(|(start, end)| Range { start, end }) + .collect::>(); + assert_eq!(get_compatible_ranges(&searched, &to_search), expected); + } Ok(()) } From bee46cdb3f62dc986d94552500afacfae03d1001 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Mon, 22 May 2023 23:36:45 -0500 Subject: [PATCH 18/23] Simplifications --- .../core/src/physical_plan/windows/mod.rs | 36 +++++----- datafusion/physical-expr/src/equivalence.rs | 22 +++--- datafusion/physical-expr/src/utils.rs | 71 +++++++++---------- 3 files changed, 62 insertions(+), 67 deletions(-) diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index e66beef6fc491..3f92edc4be9d8 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -262,18 +262,16 @@ pub(crate) fn window_ordering_equivalence( .iter() .cloned(), ); - let out_ordering = input.output_ordering().unwrap_or(&[]); - let mut out_ordering_normalized = vec![]; - for elem in out_ordering { - // Normalize expression, as we search for ordering equivalences - // on normalized versions: + let mut 
normalized_out_ordering = vec![]; + for item in input.output_ordering().unwrap_or(&[]) { + // To account for ordering equivalences, first normalize the expression: let normalized = normalize_expr_with_equivalence_properties( - elem.expr.clone(), + item.expr.clone(), input.equivalence_properties().classes(), ); if let Some(column) = normalized.as_any().downcast_ref::() { - out_ordering_normalized - .push(OrderedColumn::new(column.clone(), elem.options)); + normalized_out_ordering + .push(OrderedColumn::new(column.clone(), item.options)); } else { break; } @@ -290,18 +288,18 @@ pub(crate) fn window_ordering_equivalence( .is::() { // If there is an existing ordering, add new ordering as an equivalence: - if !out_ordering_normalized.is_empty() { - let options = SortOptions { - descending: false, - nulls_first: false, - }; // ASC, NULLS LAST - let column_info = - schema.column_with_name(expr.field().unwrap().name()); - if let Some((idx, field)) = column_info { - let rhs = - OrderedColumn::new(Column::new(field.name(), idx), options); + if !normalized_out_ordering.is_empty() { + if let Some((idx, field)) = + schema.column_with_name(expr.field().unwrap().name()) + { + let column = Column::new(field.name(), idx); + let options = SortOptions { + descending: false, + nulls_first: false, + }; // ASC, NULLS LAST + let rhs = OrderedColumn::new(column, options); result - .add_equal_conditions((&out_ordering_normalized, &vec![rhs])); + .add_equal_conditions((&normalized_out_ordering, &vec![rhs])); } } } diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 12506148b49bf..55062681690cb 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -16,11 +16,11 @@ // under the License. 
use crate::expressions::Column; +use crate::{PhysicalSortExpr, PhysicalSortRequirement}; use arrow::datatypes::SchemaRef; use arrow_schema::SortOptions; -use crate::{PhysicalSortExpr, PhysicalSortRequirement}; use std::collections::{HashMap, HashSet}; use std::hash::Hash; use std::sync::Arc; @@ -224,23 +224,21 @@ impl OrderingEquivalentClass { fn update_with_aliases(&mut self, columns_map: &HashMap>) { for (column, columns) in columns_map { let mut to_insert = vec![]; - for ordering in vec![&self.head].into_iter().chain(self.others.iter()) { - for (idx, elem) in ordering.iter().enumerate() { - if elem.col.eq(column) { + for ordering in std::iter::once(&self.head).chain(self.others.iter()) { + for (idx, item) in ordering.iter().enumerate() { + if item.col.eq(column) { for col in columns { let mut normalized = self.head.clone(); - // Change the corresponding entry in the head, with the alias column - normalized[idx] = OrderedColumn { - col: col.clone(), - options: elem.options, - }; + // Change the corresponding entry in the head with the alias column: + let entry = &mut normalized[idx]; + (entry.col, entry.options) = (col.clone(), item.options); to_insert.push(normalized); } } } } - for elems in to_insert { - self.insert(elems); + for items in to_insert { + self.insert(items); } } } @@ -273,7 +271,7 @@ pub fn project_equivalence_properties( } } - // Prune columns that no longer is in the schema from from the EquivalenceProperties. + // Prune columns that are no longer in the schema from equivalences. let schema = output_eq.schema(); let fields = schema.fields(); for class in eq_classes.iter_mut() { diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index 800d3e85320cc..f6ed43bcf2150 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -173,40 +173,41 @@ fn normalize_sort_requirement_with_equivalence_properties( sort_requirement } -// Searches `section` inside the `searched`. 
Returns each range where `section` is compatible with the corresponding slice in the `searched`. +/// This function searches for the slice `section` inside the slice `given`. +/// It returns each range where `section` is compatible with the corresponding +/// slice in `given`. fn get_compatible_ranges( - searched: &[PhysicalSortRequirement], + given: &[PhysicalSortRequirement], section: &[PhysicalSortRequirement], ) -> Vec> { let n_section = section.len(); - let n_end = if searched.len() >= n_section { - searched.len() - n_section + 1 + let n_end = if given.len() >= n_section { + given.len() - n_section + 1 } else { 0 }; - let mut res = vec![]; - for idx in 0..n_end { - let end = idx + n_section; - let matches = searched[idx..end] - .iter() - .zip(section) - .all(|(req, given)| given.compatible(req)); - if matches { - res.push(Range { start: idx, end }); - } - } - res + (0..n_end) + .filter_map(|idx| { + let end = idx + n_section; + given[idx..end] + .iter() + .zip(section) + .all(|(req, given)| given.compatible(req)) + .then_some(Range { start: idx, end }) + }) + .collect() } -// Removes duplicate entries inside the `in_data`, vector returned preserves insertion order. -fn collapse_vec(in_data: Vec) -> Vec { - let mut out_data = vec![]; - for elem in in_data { - if !out_data.contains(&elem) { - out_data.push(elem); +/// This function constructs a duplicate-free vector by filtering out duplicate +/// entries inside the given vector `input`. 
+fn collapse_vec(input: Vec) -> Vec { + let mut output = vec![]; + for item in input { + if !output.contains(&item) { + output.push(item); } } - out_data + output } pub fn normalize_sort_exprs( @@ -239,35 +240,33 @@ pub fn normalize_sort_requirements( }) .collect::>(); for ordering_eq_class in ordering_eq_properties { - for elem in ordering_eq_class.others() { - let elem: Vec = elem + for item in ordering_eq_class.others() { + let item = item .clone() .into_iter() .map(|elem| elem.into()) .collect::>(); - let ranges = get_compatible_ranges(&normalized_exprs, &elem); + let ranges = get_compatible_ranges(&normalized_exprs, &item); let mut offset: i64 = 0; for Range { start, end } in ranges { - let head: Vec = ordering_eq_class + let mut head = ordering_eq_class .head() .clone() .into_iter() .map(|elem| elem.into()) - .collect::>(); + .collect::>(); let updated_start = (start as i64 + offset) as usize; let updated_end = (end as i64 + offset) as usize; let range = end - start; offset += head.len() as i64 - range as i64; let all_none = normalized_exprs[updated_start..updated_end] .iter() - .all(|elem| elem.options.is_none()); - let head = if all_none { - head.into_iter() - .map(|elem| PhysicalSortRequirement::new(elem.expr, None)) - .collect::>() - } else { - head - }; + .all(|req| req.options.is_none()); + if all_none { + for req in head.iter_mut() { + req.options = None; + } + } normalized_exprs.splice(updated_start..updated_end, head); } } From 6a0159b8ee93e1d3445ea97a2e2a2bbc95b3fe58 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 23 May 2023 10:16:03 +0300 Subject: [PATCH 19/23] Update comments --- datafusion/core/src/physical_plan/windows/mod.rs | 2 ++ datafusion/physical-expr/src/equivalence.rs | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index 3f92edc4be9d8..d4732cc1f604a 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ 
b/datafusion/core/src/physical_plan/windows/mod.rs @@ -269,6 +269,8 @@ pub(crate) fn window_ordering_equivalence( item.expr.clone(), input.equivalence_properties().classes(), ); + // Currently we only support, ordering equivalences for `Column` expressions. + // TODO: Add support for ordering equivalence for all `PhysicalExpr`s if let Some(column) = normalized.as_any().downcast_ref::() { normalized_out_ordering .push(OrderedColumn::new(column.clone(), item.options)); diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 55062681690cb..d8878a270537e 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -221,6 +221,10 @@ impl From for PhysicalSortRequirement { pub type OrderingEquivalentClass = EquivalentClass>; impl OrderingEquivalentClass { + /// This function extends ordering equivalences with alias information. + // For instance, assume column a and b are aliases, + // and column (a ASC), (c DESC) are ordering equivalent. We append (b ASC) to ordering equivalence, + // since b is alias of colum a. After this function (a ASC), (c DESC), (b ASC) would be ordering equivalent. 
fn update_with_aliases(&mut self, columns_map: &HashMap>) { for (column, columns) in columns_map { let mut to_insert = vec![]; From c46c4cdf3a09f930147b3b05f931e77060b1133d Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 25 May 2023 10:45:21 +0300 Subject: [PATCH 20/23] Update comments --- datafusion/physical-expr/src/equivalence.rs | 17 ++++++++++++++--- datafusion/physical-expr/src/utils.rs | 16 ++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index d8878a270537e..a1f2df9208f07 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -218,13 +218,24 @@ impl From for PhysicalSortRequirement { } } +/// `Vec` stores the lexicographical ordering for a schema. +/// OrderingEquivalentClass keeps track of different alternative orderings that can +/// describe the schema. +/// For instance, for the table below +/// |a|b|c|d| +/// |1|4|3|1| +/// |2|3|3|2| +/// |3|1|2|2| +/// |3|2|1|3| +/// both `vec![a ASC, b ASC]` and `vec![c DESC, d ASC]` describe the ordering of the table. +/// For this case, we say that `vec![a ASC, b ASC]`, and `vec![c DESC, d ASC]` are ordering equivalent. pub type OrderingEquivalentClass = EquivalentClass>; impl OrderingEquivalentClass { /// This function extends ordering equivalences with alias information. - // For instance, assume column a and b are aliases, - // and column (a ASC), (c DESC) are ordering equivalent. We append (b ASC) to ordering equivalence, - // since b is alias of colum a. After this function (a ASC), (c DESC), (b ASC) would be ordering equivalent. + /// For instance, assume column a and b are aliases, + /// and column (a ASC), (c DESC) are ordering equivalent. We append (b ASC) to ordering equivalence, + /// since b is alias of column a. After this function (a ASC), (c DESC), (b ASC) would be ordering equivalent. 
fn update_with_aliases(&mut self, columns_map: &HashMap>) { for (column, columns) in columns_map { let mut to_insert = vec![]; diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils.rs index f6ed43bcf2150..54a18a157304d 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils.rs @@ -210,6 +210,14 @@ fn collapse_vec(input: Vec) -> Vec { output } +/// Transform `sort_exprs` vector, to standardized version using `eq_properties` and `ordering_eq_properties` +/// Assume `eq_properties` states that `Column a` and `Column b` are aliases. +/// Also assume `ordering_eq_properties` states that ordering `vec![d ASC]` and `vec![a ASC, c ASC]` are +/// ordering equivalent (in the sense that both describe the ordering of the table). +/// If the `sort_exprs` input to this function were `vec![b ASC, c ASC]`, +/// This function converts `sort_exprs` `vec![b ASC, c ASC]` to first `vec![a ASC, c ASC]` after considering `eq_properties` +/// Then converts `vec![a ASC, c ASC]` to `vec![d ASC]` after considering `ordering_eq_properties`. +/// Standardized version `vec![d ASC]` is used in subsequent operations. pub fn normalize_sort_exprs( sort_exprs: &[PhysicalSortExpr], eq_properties: &[EquivalentClass], @@ -225,6 +233,14 @@ pub fn normalize_sort_exprs( collapse_vec(normalized_exprs) } +/// Transform `sort_reqs` vector, to standardized version using `eq_properties` and `ordering_eq_properties` +/// Assume `eq_properties` states that `Column a` and `Column b` are aliases. +/// Also assume `ordering_eq_properties` states that ordering `vec![d ASC]` and `vec![a ASC, c ASC]` are +/// ordering equivalent (in the sense that both describe the ordering of the table). 
+/// If the `sort_reqs` input to this function were `vec![b Some(ASC), c None]`, +/// This function converts `sort_reqs` `vec![b Some(ASC), c None]` to first `vec![a Some(ASC), c None]` after considering `eq_properties` +/// Then converts `vec![a Some(ASC), c None]` to `vec![d Some(ASC)]` after considering `ordering_eq_properties`. +/// Standardized version `vec![d Some(ASC)]` is used in subsequent operations. pub fn normalize_sort_requirements( sort_reqs: &[PhysicalSortRequirement], eq_properties: &[EquivalentClass], From 0e47213ed429471484dab1a4c74f00d6a5ed02a1 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 25 May 2023 11:45:36 +0300 Subject: [PATCH 21/23] Use builder style for ordering equivalence creation --- .../core/src/physical_plan/windows/mod.rs | 58 +++++--------- datafusion/physical-expr/src/equivalence.rs | 76 ++++++++++++++++++- 2 files changed, 92 insertions(+), 42 deletions(-) diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index d4732cc1f604a..41685c3cf004e 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -47,14 +47,14 @@ mod window_agg_exec; pub use bounded_window_agg_exec::BoundedWindowAggExec; pub use bounded_window_agg_exec::PartitionSearchMode; use datafusion_common::utils::longest_consecutive_prefix; +use datafusion_physical_expr::equivalence::OrderingEquivalenceBuilder; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::{convert_to_expr, get_indices_of_matching_exprs}; pub use datafusion_physical_expr::window::{ BuiltInWindowExpr, PlainAggregateWindowExpr, WindowExpr, }; use datafusion_physical_expr::{ - normalize_expr_with_equivalence_properties, OrderedColumn, - OrderingEquivalenceProperties, PhysicalSortRequirement, + OrderedColumn, OrderingEquivalenceProperties, PhysicalSortRequirement, }; pub use window_agg_exec::WindowAggExec; @@ -254,30 +254,10 @@ 
pub(crate) fn window_ordering_equivalence( ) -> OrderingEquivalenceProperties { // We need to update the schema, so we can not directly use // `input.ordering_equivalence_properties()`. - let mut result = OrderingEquivalenceProperties::new(schema.clone()); - result.extend( - input - .ordering_equivalence_properties() - .classes() - .iter() - .cloned(), - ); - let mut normalized_out_ordering = vec![]; - for item in input.output_ordering().unwrap_or(&[]) { - // To account for ordering equivalences, first normalize the expression: - let normalized = normalize_expr_with_equivalence_properties( - item.expr.clone(), - input.equivalence_properties().classes(), - ); - // Currently we only support, ordering equivalences for `Column` expressions. - // TODO: Add support for ordering equivalence for all `PhysicalExpr`s - if let Some(column) = normalized.as_any().downcast_ref::() { - normalized_out_ordering - .push(OrderedColumn::new(column.clone(), item.options)); - } else { - break; - } - } + let mut builder = OrderingEquivalenceBuilder::new(schema.clone()) + .with_equivalences(input.equivalence_properties()) + .with_existing_ordering(input.output_ordering().map(|elem| elem.to_vec())) + .extend(input.ordering_equivalence_properties()); for expr in window_expr { if let Some(builtin_window_expr) = expr.as_any().downcast_ref::() @@ -289,25 +269,21 @@ pub(crate) fn window_ordering_equivalence( .as_any() .is::() { - // If there is an existing ordering, add new ordering as an equivalence: - if !normalized_out_ordering.is_empty() { - if let Some((idx, field)) = - schema.column_with_name(expr.field().unwrap().name()) - { - let column = Column::new(field.name(), idx); - let options = SortOptions { - descending: false, - nulls_first: false, - }; // ASC, NULLS LAST - let rhs = OrderedColumn::new(column, options); - result - .add_equal_conditions((&normalized_out_ordering, &vec![rhs])); - } + if let Some((idx, field)) = + schema.column_with_name(expr.field().unwrap().name()) + { + let 
column = Column::new(field.name(), idx); + let options = SortOptions { + descending: false, + nulls_first: false, + }; // ASC, NULLS LAST + let rhs = OrderedColumn::new(column, options); + builder.add_equal_conditions(vec![rhs]); } } } } - result + builder.ordering_equivalence() } #[cfg(test)] mod tests { diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index a1f2df9208f07..33932faf19c07 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -16,7 +16,9 @@ // under the License. use crate::expressions::Column; -use crate::{PhysicalSortExpr, PhysicalSortRequirement}; +use crate::{ + normalize_expr_with_equivalence_properties, PhysicalSortExpr, PhysicalSortRequirement, +}; use arrow::datatypes::SchemaRef; use arrow_schema::SortOptions; @@ -259,6 +261,78 @@ impl OrderingEquivalentClass { } } +pub struct OrderingEquivalenceBuilder { + eq_properties: EquivalenceProperties, + ordering_eq_properties: OrderingEquivalenceProperties, + existing_ordering: Vec, +} + +impl OrderingEquivalenceBuilder { + pub fn new(schema: SchemaRef) -> Self { + let eq_properties = EquivalenceProperties::new(schema.clone()); + let ordering_eq_properties = OrderingEquivalenceProperties::new(schema); + Self { + eq_properties, + ordering_eq_properties, + existing_ordering: vec![], + } + } + + pub fn extend( + mut self, + new_ordering_eq_properties: OrderingEquivalenceProperties, + ) -> Self { + self.ordering_eq_properties + .extend(new_ordering_eq_properties.classes().iter().cloned()); + self + } + + pub fn with_existing_ordering( + mut self, + existing_ordering: Option>, + ) -> Self { + if let Some(existing_ordering) = existing_ordering { + self.existing_ordering = existing_ordering; + } + self + } + + pub fn with_equivalences(mut self, new_eq_properties: EquivalenceProperties) -> Self { + self.eq_properties = new_eq_properties; + self + } + + pub fn add_equal_conditions(&mut self, 
new_equivalent_ordering: Vec) { + let mut normalized_out_ordering = vec![]; + for item in &self.existing_ordering { + // To account for ordering equivalences, first normalize the expression: + let normalized = normalize_expr_with_equivalence_properties( + item.expr.clone(), + self.eq_properties.classes(), + ); + // Currently we only support, ordering equivalences for `Column` expressions. + // TODO: Add support for ordering equivalence for all `PhysicalExpr`s + if let Some(column) = normalized.as_any().downcast_ref::() { + normalized_out_ordering + .push(OrderedColumn::new(column.clone(), item.options)); + } else { + break; + } + } + // If there is an existing ordering, add new ordering as an equivalence: + if !normalized_out_ordering.is_empty() { + self.ordering_eq_properties.add_equal_conditions(( + &normalized_out_ordering, + &new_equivalent_ordering, + )); + } + } + + pub fn ordering_equivalence(&self) -> OrderingEquivalenceProperties { + self.ordering_eq_properties.clone() + } +} + /// This function applies the given projection to the given equivalence /// properties to compute the resulting (projected) equivalence properties; e.g. /// 1) Adding an alias, which can introduce additional equivalence properties, From e24749e8456330a152bf865341b16c3d4e50aa65 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Thu, 25 May 2023 21:11:16 -0500 Subject: [PATCH 22/23] Minor comment changes --- datafusion/physical-expr/src/equivalence.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 33932faf19c07..6557f2d2d3b60 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -261,6 +261,8 @@ impl OrderingEquivalentClass { } } +/// This is a builder object facilitating incremental construction +/// for ordering equivalences. 
pub struct OrderingEquivalenceBuilder { eq_properties: EquivalenceProperties, ordering_eq_properties: OrderingEquivalenceProperties, @@ -310,8 +312,8 @@ impl OrderingEquivalenceBuilder { item.expr.clone(), self.eq_properties.classes(), ); - // Currently we only support, ordering equivalences for `Column` expressions. - // TODO: Add support for ordering equivalence for all `PhysicalExpr`s + // Currently we only support ordering equivalences for `Column` expressions. + // TODO: Add support for ordering equivalence for all `PhysicalExpr`s. if let Some(column) = normalized.as_any().downcast_ref::() { normalized_out_ordering .push(OrderedColumn::new(column.clone(), item.options)); From ffdd069bde02d0e4ab6aa8cc8f90a0fa06fc61fb Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Fri, 26 May 2023 16:31:00 +0300 Subject: [PATCH 23/23] Address reviews --- datafusion/core/src/physical_plan/windows/mod.rs | 2 +- datafusion/physical-expr/src/equivalence.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/core/src/physical_plan/windows/mod.rs index 41685c3cf004e..d7eedf7f18ad8 100644 --- a/datafusion/core/src/physical_plan/windows/mod.rs +++ b/datafusion/core/src/physical_plan/windows/mod.rs @@ -283,7 +283,7 @@ pub(crate) fn window_ordering_equivalence( } } } - builder.ordering_equivalence() + builder.build() } #[cfg(test)] mod tests { diff --git a/datafusion/physical-expr/src/equivalence.rs b/datafusion/physical-expr/src/equivalence.rs index 6557f2d2d3b60..659d159c62145 100644 --- a/datafusion/physical-expr/src/equivalence.rs +++ b/datafusion/physical-expr/src/equivalence.rs @@ -330,8 +330,8 @@ impl OrderingEquivalenceBuilder { } } - pub fn ordering_equivalence(&self) -> OrderingEquivalenceProperties { - self.ordering_eq_properties.clone() + pub fn build(self) -> OrderingEquivalenceProperties { + self.ordering_eq_properties } }