From 6d157cc197fe2860b57c072cd7503539f1abe60d Mon Sep 17 00:00:00 2001 From: sriram Date: Sat, 1 Nov 2025 01:05:59 +0530 Subject: [PATCH 001/157] Feat: Make extract (date_part) timezone aware --- .../functions/src/datetime/date_part.rs | 108 +++++++++++++++++- .../sqllogictest/test_files/extract_tz.slt | 68 +++++++++++ 2 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/extract_tz.slt diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index aa23a5028dd8..9a2af8d83449 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -19,14 +19,21 @@ use std::any::Any; use std::str::FromStr; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, Float64Array, Int32Array}; +use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, PrimitiveBuilder}; +use arrow::array::timezone::Tz; use arrow::compute::kernels::cast_utils::IntervalUnit; use arrow::compute::{binary, date_part, DatePart}; use arrow::datatypes::DataType::{ Date32, Date64, Duration, Interval, Time32, Time64, Timestamp, }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; -use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit}; +use arrow::datatypes::{ + ArrowTimestampType, DataType, Field, FieldRef, TimeUnit, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, +}; +use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; +use datafusion_common::cast::as_primitive_array; +use std::ops::Add; use datafusion_common::types::{logical_date, NativeType}; use datafusion_common::{ @@ -36,7 +43,7 @@ use datafusion_common::{ as_timestamp_microsecond_array, as_timestamp_millisecond_array, as_timestamp_nanosecond_array, as_timestamp_second_array, }, - exec_err, internal_err, not_impl_err, + exec_err, internal_datafusion_err, internal_err, not_impl_err, types::logical_string, utils::take_function_args, Result, ScalarValue, @@ -56,7 +63,7 @@ use datafusion_macros::user_doc; argument( name = "part", description = r#"Part of the date to return. 
The following date parts are supported: - + - year - quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in) - month @@ -173,6 +180,7 @@ impl ScalarUDFImpl for DatePartFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let config = &args.config_options; let args = args.args; let [part, array] = take_function_args(self.name(), args)?; @@ -193,6 +201,35 @@ impl ScalarUDFImpl for DatePartFunc { ColumnarValue::Scalar(scalar) => scalar.to_array()?, }; + // Adjust timestamps for timezone-aware extraction + let array = if let Timestamp(time_unit, Some(tz_str)) = array.data_type() { + // For timezone-aware timestamps, extract in their own timezone + let tz = match tz_str.parse::() { + Ok(tz) => tz, + Err(_) => return exec_err!("Invalid timezone"), + }; + match time_unit { + Nanosecond => adjust_timestamp_array::(&array, tz)?, + Microsecond => adjust_timestamp_array::(&array, tz)?, + Millisecond => adjust_timestamp_array::(&array, tz)?, + Second => adjust_timestamp_array::(&array, tz)?, + } + } else if let Timestamp(time_unit, None) = array.data_type() { + // For naive timestamps, interpret in session timezone + let tz = match config.execution.time_zone.parse::() { + Ok(tz) => tz, + Err(_) => return exec_err!("Invalid timezone"), + }; + match time_unit { + Nanosecond => adjust_timestamp_array::(&array, tz)?, + Microsecond => adjust_timestamp_array::(&array, tz)?, + Millisecond => adjust_timestamp_array::(&array, tz)?, + Second => adjust_timestamp_array::(&array, tz)?, + } + } else { + array + }; + let part_trim = part_normalization(&part); // using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds") @@ -240,6 +277,69 @@ impl ScalarUDFImpl for DatePartFunc { } } +fn adjust_to_local_time(ts: i64, tz: Tz) -> Result { + fn convert_timestamp(ts: i64, converter: F) -> Result> + where + F: Fn(i64) -> MappedLocalTime>, + { + match converter(ts) { + MappedLocalTime::Ambiguous(earliest, latest) => exec_err!( + "Ambiguous timestamp. Do you mean {:?} or {:?}", + earliest, + latest + ), + MappedLocalTime::None => exec_err!( + "The local time does not exist because there is a gap in the local time." + ), + MappedLocalTime::Single(date_time) => Ok(date_time), + } + } + + let date_time = match T::UNIT { + Nanosecond => Utc.timestamp_nanos(ts), + Microsecond => convert_timestamp(ts, |ts| Utc.timestamp_micros(ts))?, + Millisecond => convert_timestamp(ts, |ts| Utc.timestamp_millis_opt(ts))?, + Second => convert_timestamp(ts, |ts| Utc.timestamp_opt(ts, 0))?, + }; + + let offset_seconds: i64 = tz + .offset_from_utc_datetime(&date_time.naive_utc()) + .fix() + .local_minus_utc() as i64; + + let adjusted_date_time = date_time.add( + TimeDelta::try_seconds(offset_seconds) + .ok_or_else(|| internal_datafusion_err!("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000"))?, + ); + + // convert back to i64 + match T::UNIT { + Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or_else(|| { + internal_datafusion_err!( + "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. 
The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807" + ) + }), + Microsecond => Ok(adjusted_date_time.timestamp_micros()), + Millisecond => Ok(adjusted_date_time.timestamp_millis()), + Second => Ok(adjusted_date_time.timestamp()), + } +} + +fn adjust_timestamp_array(array: &ArrayRef, tz: Tz) -> Result { + let mut builder = PrimitiveBuilder::::new(); + let primitive_array = as_primitive_array::(array)?; + for ts_opt in primitive_array.iter() { + match ts_opt { + None => builder.append_null(), + Some(ts) => { + let adjusted_ts = adjust_to_local_time::(ts, tz)?; + builder.append_value(adjusted_ts); + } + } + } + Ok(Arc::new(builder.finish())) +} + fn is_epoch(part: &str) -> bool { let part = part_normalization(part); matches!(part.to_lowercase().as_str(), "epoch") diff --git a/datafusion/sqllogictest/test_files/extract_tz.slt b/datafusion/sqllogictest/test_files/extract_tz.slt new file mode 100644 index 000000000000..2064cae07aa0 --- /dev/null +++ b/datafusion/sqllogictest/test_files/extract_tz.slt @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for timezone-aware date_part functionality + +# Test with different timezone +statement ok +SET datafusion.execution.time_zone = '-03:00'; + +query I +SELECT EXTRACT(HOUR FROM TIMESTAMP '2025-11-18 10:00:00'); +---- +7 + +query II +SELECT EXTRACT(MINUTE FROM TIMESTAMP '2023-10-30 10:45:30'), + EXTRACT(SECOND FROM TIMESTAMP '2023-10-30 10:45:30'); +---- +45 30 + +query III +SELECT EXTRACT(YEAR FROM DATE '2023-10-30'), + EXTRACT(MONTH FROM DATE '2023-10-30'), + EXTRACT(DAY FROM DATE '2023-10-30'); +---- +2023 10 30 + +query I +SELECT EXTRACT(HOUR FROM CAST(NULL AS TIMESTAMP)); +---- +NULL + +statement ok +SET datafusion.execution.time_zone = '+04:00'; + +query I +SELECT EXTRACT(HOUR FROM TIMESTAMP '2023-10-30 02:00:00'); +---- +6 + +query III +SELECT EXTRACT(HOUR FROM TIMESTAMP '2023-10-30 18:20:59'), + EXTRACT(MINUTE FROM TIMESTAMP '2023-10-30 18:20:59'), + EXTRACT(SECOND FROM TIMESTAMP '2023-10-30 18:20:59'); +---- +22 20 59 + +query II +SELECT EXTRACT(DOW FROM DATE '2025-11-01'), + EXTRACT(DOY FROM DATE '2026-12-31'); +---- +6 365 + + From 3f2e3787ae4c989be5f087b2e09474ae76647b86 Mon Sep 17 00:00:00 2001 From: sriram Date: Sat, 1 Nov 2025 01:12:14 +0530 Subject: [PATCH 002/157] Format files. 
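No functional change beyond formatting. For reference, the adjustment introduced in the previous patch can be sketched standalone as follows; the sketch uses chrono and chrono-tz directly rather than arrow's Tz wrapper, and local_hour is an illustrative name that is not part of the change:

    // Sketch only: chrono-tz stands in for arrow's `Tz`; not part of the patch.
    use chrono::{Offset, TimeZone, Timelike, Utc};
    use chrono_tz::Tz;

    /// Shift a UTC epoch value (in seconds) by the zone's UTC offset at that
    /// instant, then read wall-clock fields from the shifted value. Same idea
    /// as `adjust_to_local_time` above, restricted to second precision.
    fn local_hour(epoch_secs: i64, tz: Tz) -> Option<u32> {
        let utc = Utc.timestamp_opt(epoch_secs, 0).single()?;
        // Offset of `tz` from UTC at this instant; chrono-tz accounts for DST.
        let offset_secs = tz
            .offset_from_utc_datetime(&utc.naive_utc())
            .fix()
            .local_minus_utc() as i64;
        let shifted = Utc.timestamp_opt(epoch_secs + offset_secs, 0).single()?;
        // A naive extraction on the shifted value now reports local time.
        Some(shifted.hour())
    }

    fn main() {
        // 2023-10-30T10:45:30Z is 06:45:30 in America/New_York (EDT, UTC-4).
        let tz: Tz = "America/New_York".parse().unwrap();
        assert_eq!(local_hour(1_698_662_730, tz), Some(6));
    }

The real code applies the same shift per Arrow time unit (seconds through nanoseconds), rebuilding the array with a PrimitiveBuilder as in adjust_timestamp_array above.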
--- .../functions/src/datetime/date_part.rs | 33 ++++++++++++++----- .../sqllogictest/test_files/extract_tz.slt | 3 +- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 9a2af8d83449..b14c57d3a0f2 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -19,8 +19,8 @@ use std::any::Any; use std::str::FromStr; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, PrimitiveBuilder}; use arrow::array::timezone::Tz; +use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, PrimitiveBuilder}; use arrow::compute::kernels::cast_utils::IntervalUnit; use arrow::compute::{binary, date_part, DatePart}; use arrow::datatypes::DataType::{ @@ -33,8 +33,8 @@ use arrow::datatypes::{ }; use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; use datafusion_common::cast::as_primitive_array; -use std::ops::Add; use datafusion_common::types::{logical_date, NativeType}; +use std::ops::Add; use datafusion_common::{ cast::{ @@ -209,9 +209,15 @@ impl ScalarUDFImpl for DatePartFunc { Err(_) => return exec_err!("Invalid timezone"), }; match time_unit { - Nanosecond => adjust_timestamp_array::(&array, tz)?, - Microsecond => adjust_timestamp_array::(&array, tz)?, - Millisecond => adjust_timestamp_array::(&array, tz)?, + Nanosecond => { + adjust_timestamp_array::(&array, tz)? + } + Microsecond => { + adjust_timestamp_array::(&array, tz)? + } + Millisecond => { + adjust_timestamp_array::(&array, tz)? + } Second => adjust_timestamp_array::(&array, tz)?, } } else if let Timestamp(time_unit, None) = array.data_type() { @@ -221,9 +227,15 @@ impl ScalarUDFImpl for DatePartFunc { Err(_) => return exec_err!("Invalid timezone"), }; match time_unit { - Nanosecond => adjust_timestamp_array::(&array, tz)?, - Microsecond => adjust_timestamp_array::(&array, tz)?, - Millisecond => adjust_timestamp_array::(&array, tz)?, + Nanosecond => { + adjust_timestamp_array::(&array, tz)? + } + Microsecond => { + adjust_timestamp_array::(&array, tz)? + } + Millisecond => { + adjust_timestamp_array::(&array, tz)? + } Second => adjust_timestamp_array::(&array, tz)?, } } else { @@ -325,7 +337,10 @@ fn adjust_to_local_time(ts: i64, tz: Tz) -> Result { } } -fn adjust_timestamp_array(array: &ArrayRef, tz: Tz) -> Result { +fn adjust_timestamp_array( + array: &ArrayRef, + tz: Tz, +) -> Result { let mut builder = PrimitiveBuilder::::new(); let primitive_array = as_primitive_array::(array)?; for ts_opt in primitive_array.iter() { diff --git a/datafusion/sqllogictest/test_files/extract_tz.slt b/datafusion/sqllogictest/test_files/extract_tz.slt index 2064cae07aa0..9a03236b6609 100644 --- a/datafusion/sqllogictest/test_files/extract_tz.slt +++ b/datafusion/sqllogictest/test_files/extract_tz.slt @@ -15,8 +15,7 @@ # specific language governing permissions and limitations # under the License. -# Tests for timezone-aware date_part functionality - +# Tests for timezone-aware extract SQL statement support. 
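+#
+# The behaviour these cases pin down (added in the previous patch): a naive
+# TIMESTAMP literal is treated as UTC and shifted into the session time
+# zone before the field is extracted, so with the session zone set to
+# '-03:00' below, hour 10 is reported as 7.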
# Test with different timezone statement ok SET datafusion.execution.time_zone = '-03:00'; From 97b0524e9257b96829ef038efed293b16b3cb19a Mon Sep 17 00:00:00 2001 From: sriram Date: Sat, 1 Nov 2025 23:40:00 +0530 Subject: [PATCH 003/157] Make extract() time aware and register it as an independent function instead of going through date_part() --- .../functions/src/datetime/date_part.rs | 46 ++++++----- datafusion/functions/src/datetime/mod.rs | 3 + datafusion/functions/src/datetime/planner.rs | 2 +- datafusion/sql/src/expr/mod.rs | 77 ++++++++++++------- .../sqllogictest/test_files/extract_tz.slt | 26 +++++++ .../sqllogictest/test_files/group_by.slt | 12 +-- .../optimizer_group_by_constant.slt | 2 +- .../test_files/table_functions.slt | 4 +- 8 files changed, 118 insertions(+), 54 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index b14c57d3a0f2..4754589ad19d 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -27,10 +27,7 @@ use arrow::datatypes::DataType::{ Date32, Date64, Duration, Interval, Time32, Time64, Timestamp, }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; -use arrow::datatypes::{ - ArrowTimestampType, DataType, Field, FieldRef, TimeUnit, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, -}; +use arrow::datatypes::{ArrowTimestampType, DataType, Field, FieldRef, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType}; use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; use datafusion_common::cast::as_primitive_array; use datafusion_common::types::{logical_date, NativeType}; @@ -201,24 +198,34 @@ impl ScalarUDFImpl for DatePartFunc { ColumnarValue::Scalar(scalar) => scalar.to_array()?, }; - // Adjust timestamps for timezone-aware extraction - let array = if let Timestamp(time_unit, Some(tz_str)) = array.data_type() { + let (is_timezone_aware, tz_str_opt) = match array.data_type() { + Timestamp(_, Some(tz_str)) => (true, Some(tz_str.clone())), + _ => (false, None), + }; + + // Adjust timestamps for extraction + let array = if is_timezone_aware { // For timezone-aware timestamps, extract in their own timezone + let tz_str = tz_str_opt.as_ref().unwrap(); let tz = match tz_str.parse::() { Ok(tz) => tz, Err(_) => return exec_err!("Invalid timezone"), }; - match time_unit { - Nanosecond => { - adjust_timestamp_array::(&array, tz)? - } - Microsecond => { - adjust_timestamp_array::(&array, tz)? - } - Millisecond => { - adjust_timestamp_array::(&array, tz)? - } - Second => adjust_timestamp_array::(&array, tz)?, + match array.data_type() { + Timestamp(time_unit, _) => match time_unit { + Nanosecond => { + adjust_timestamp_array::(&array, tz)? + } + Microsecond => { + adjust_timestamp_array::(&array, tz)? + } + Millisecond => { + adjust_timestamp_array::(&array, tz)? + } + Second => adjust_timestamp_array::(&array, tz)?, + _ => array, + }, + _ => array, } } else if let Timestamp(time_unit, None) = array.data_type() { // For naive timestamps, interpret in session timezone @@ -237,6 +244,7 @@ impl ScalarUDFImpl for DatePartFunc { adjust_timestamp_array::(&array, tz)? 
} Second => adjust_timestamp_array::(&array, tz)?, + _ => array, } } else { array @@ -246,7 +254,7 @@ impl ScalarUDFImpl for DatePartFunc { // using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds") // and synonyms ( like "ms,msec,msecond,millisecond") to Arrow - let arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { + let mut arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { match interval_unit { IntervalUnit::Year => date_part(array.as_ref(), DatePart::Year)?, IntervalUnit::Month => date_part(array.as_ref(), DatePart::Month)?, @@ -273,6 +281,8 @@ impl ScalarUDFImpl for DatePartFunc { } }; + + Ok(if is_scalar { ColumnarValue::Scalar(ScalarValue::try_from_array(arr.as_ref(), 0)?) } else { diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index d80f14facf82..a842b6d7a9d5 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -27,6 +27,7 @@ pub mod current_time; pub mod date_bin; pub mod date_part; pub mod date_trunc; +pub mod extract; pub mod from_unixtime; pub mod make_date; pub mod now; @@ -43,6 +44,7 @@ make_udf_function!(current_time::CurrentTimeFunc, current_time); make_udf_function!(date_bin::DateBinFunc, date_bin); make_udf_function!(date_part::DatePartFunc, date_part); make_udf_function!(date_trunc::DateTruncFunc, date_trunc); +make_udf_function!(extract::ExtractFunc, extract); make_udf_function!(make_date::MakeDateFunc, make_date); make_udf_function!(from_unixtime::FromUnixtimeFunc, from_unixtime); make_udf_function!(to_char::ToCharFunc, to_char); @@ -265,6 +267,7 @@ pub fn functions() -> Vec> { date_bin(), date_part(), date_trunc(), + extract(), from_unixtime(), make_date(), now(&ConfigOptions::default()), diff --git a/datafusion/functions/src/datetime/planner.rs b/datafusion/functions/src/datetime/planner.rs index f4b64c3711e2..20442d0205a2 100644 --- a/datafusion/functions/src/datetime/planner.rs +++ b/datafusion/functions/src/datetime/planner.rs @@ -29,7 +29,7 @@ impl ExprPlanner for DatetimeFunctionPlanner { args: Vec, ) -> datafusion_common::Result>> { Ok(PlannerResult::Planned(Expr::ScalarFunction( - ScalarFunction::new_udf(crate::datetime::date_part(), args), + ScalarFunction::new_udf(crate::datetime::extract(), args), ))) } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index fef0505e993f..350f65019c0d 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -16,6 +16,7 @@ // under the License. use arrow::datatypes::{DataType, TimeUnit}; +use std::sync::Arc; use datafusion_expr::planner::{ PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr, }; @@ -294,15 +295,24 @@ impl SqlToRel<'_, S> { } SQLExpr::TypedString(TypedString { - data_type, - value, - uses_odbc_syntax: _, - }) => Ok(Expr::Cast(Cast::new( - Box::new(lit(value.into_string().unwrap())), - self.convert_data_type_to_field(&data_type)? + data_type, + value, + uses_odbc_syntax: _, + }) => { + let string_value = value.into_string().unwrap(); + let mut cast_data_type = self.convert_data_type_to_field(&data_type)? 
.data_type() - .clone(), - ))), + .clone(); + if let DataType::Timestamp(time_unit, None) = &cast_data_type { + if let Some(tz) = extract_tz_from_string(&string_value) { + cast_data_type = DataType::Timestamp(*time_unit, Some(Arc::from(tz))); + } + } + Ok(Expr::Cast(Cast::new( + Box::new(lit(string_value)), + cast_data_type, + ))) + } SQLExpr::IsNull(expr) => Ok(Expr::IsNull(Box::new( self.sql_expr_to_logical_expr(*expr, schema, planner_context)?, @@ -554,9 +564,9 @@ impl SqlToRel<'_, S> { )?), match *time_zone { SQLExpr::Value(ValueWithSpan { - value: Value::SingleQuotedString(s), - span: _, - }) => DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())), + value: Value::SingleQuotedString(s), + span: _, + }) => DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())), _ => { return not_impl_err!( "Unsupported ast node in sqltorel: {time_zone:?}" @@ -980,13 +990,13 @@ impl SqlToRel<'_, S> { // to align with postgres / duckdb semantics let expr = match dt.data_type() { DataType::Timestamp(TimeUnit::Nanosecond, tz) - if expr.get_type(schema)? == DataType::Int64 => - { - Expr::Cast(Cast::new( - Box::new(expr), - DataType::Timestamp(TimeUnit::Second, tz.clone()), - )) - } + if expr.get_type(schema)? == DataType::Int64 => + { + Expr::Cast(Cast::new( + Box::new(expr), + DataType::Timestamp(TimeUnit::Second, tz.clone()), + )) + } _ => expr, }; @@ -1078,11 +1088,11 @@ impl SqlToRel<'_, S> { // index can be a name, in which case it is a named field access match index { SQLExpr::Value(ValueWithSpan { - value: - Value::SingleQuotedString(s) - | Value::DoubleQuotedString(s), - span: _, - }) => Ok(Some(GetFieldAccess::NamedStructField { + value: + Value::SingleQuotedString(s) + | Value::DoubleQuotedString(s), + span: _, + }) => Ok(Some(GetFieldAccess::NamedStructField { name: ScalarValue::from(s), })), SQLExpr::JsonAccess { .. 
} => { @@ -1146,9 +1156,9 @@ impl SqlToRel<'_, S> { } AccessExpr::Dot(expr) => match expr { SQLExpr::Value(ValueWithSpan { - value: Value::SingleQuotedString(s) | Value::DoubleQuotedString(s), - span : _ - }) => Ok(Some(GetFieldAccess::NamedStructField { + value: Value::SingleQuotedString(s) | Value::DoubleQuotedString(s), + span : _ + }) => Ok(Some(GetFieldAccess::NamedStructField { name: ScalarValue::from(s), })), _ => { @@ -1180,6 +1190,21 @@ impl SqlToRel<'_, S> { } } +fn extract_tz_from_string(s: &str) -> Option { + if let Some(pos) = s.rfind(|c| c == '+' || c == '-') { + let tz_str = &s[pos..]; + if tz_str.len() == 6 && tz_str.chars().nth(3) == Some(':') { + Some(tz_str.to_string()) + } else { + None + } + } else if s.ends_with('Z') { + Some("+00:00".to_string()) + } else { + None + } +} + #[cfg(test)] mod tests { use std::collections::HashMap; diff --git a/datafusion/sqllogictest/test_files/extract_tz.slt b/datafusion/sqllogictest/test_files/extract_tz.slt index 9a03236b6609..32e6b0fbfbb6 100644 --- a/datafusion/sqllogictest/test_files/extract_tz.slt +++ b/datafusion/sqllogictest/test_files/extract_tz.slt @@ -64,4 +64,30 @@ SELECT EXTRACT(DOW FROM DATE '2025-11-01'), ---- 6 365 +statement ok +SET datafusion.execution.time_zone = '+00:00'; + +query I +SELECT EXTRACT(HOUR FROM TIMESTAMP '2025-10-30 10:45:30+02:00'); +---- +12 + +query I +SELECT EXTRACT(HOUR FROM TIMESTAMP '2025-10-30 10:45:30-05:00'); +---- +5 + +query II +SELECT EXTRACT(YEAR FROM TIMESTAMP '2026-11-30 10:45:30Z'), + EXTRACT(MONTH FROM TIMESTAMP '2023-10-30 10:45:30Z'); +---- +2026 10 + +query III +SELECT EXTRACT(HOUR FROM TIMESTAMP '2023-10-30 18:20:59+04:00'), + EXTRACT(MINUTE FROM TIMESTAMP '2023-10-30 18:20:59+04:00'), + EXTRACT(SECOND FROM TIMESTAMP '2023-10-30 18:20:59+04:00'); +---- +22 20 59 + diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index b72f73d44698..7a9dfe151961 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4345,17 +4345,17 @@ EXPLAIN SELECT extract(month from ts) as months ---- logical_plan 01)Sort: months DESC NULLS FIRST, fetch=5 -02)--Projection: date_part(Utf8("MONTH"),csv_with_timestamps.ts) AS months -03)----Aggregate: groupBy=[[date_part(Utf8("MONTH"), csv_with_timestamps.ts)]], aggr=[[]] +02)--Projection: extract(Utf8("MONTH"),csv_with_timestamps.ts) AS months +03)----Aggregate: groupBy=[[extract(Utf8("MONTH"), csv_with_timestamps.ts)]], aggr=[[]] 04)------TableScan: csv_with_timestamps projection=[ts] physical_plan 01)SortPreservingMergeExec: [months@0 DESC], fetch=5 02)--SortExec: TopK(fetch=5), expr=[months@0 DESC], preserve_partitioning=[true] -03)----ProjectionExec: expr=[date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0 as months] -04)------AggregateExec: mode=FinalPartitioned, gby=[date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0 as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] +03)----ProjectionExec: expr=[extract(Utf8("MONTH"),csv_with_timestamps.ts)@0 as months] +04)------AggregateExec: mode=FinalPartitioned, gby=[extract(Utf8("MONTH"),csv_with_timestamps.ts)@0 as extract(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] 05)--------CoalesceBatchesExec: target_batch_size=2 -06)----------RepartitionExec: partitioning=Hash([date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0], 8), input_partitions=8 -07)------------AggregateExec: mode=Partial, gby=[date_part(MONTH, ts@0) as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] 
+06)----------RepartitionExec: partitioning=Hash([extract(Utf8("MONTH"),csv_with_timestamps.ts)@0], 8), input_partitions=8 +07)------------AggregateExec: mode=Partial, gby=[extract(MONTH, ts@0) as extract(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] 08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/timestamps.csv]]}, projection=[ts], output_ordering=[ts@0 DESC], file_type=csv, has_header=false diff --git a/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt b/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt index de6a153f58d9..9a666595ac57 100644 --- a/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt +++ b/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt @@ -90,7 +90,7 @@ FROM test_table t GROUP BY 1 ---- logical_plan -01)Projection: Boolean(true) AS NOT date_part(Utf8("MONTH"),now()) BETWEEN Int64(50) AND Int64(60), count(Int64(1)) +01)Projection: Boolean(true) AS NOT extract(Utf8("MONTH"),now()) BETWEEN Int64(50) AND Int64(60), count(Int64(1)) 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] 03)----SubqueryAlias: t 04)------TableScan: test_table projection=[] diff --git a/datafusion/sqllogictest/test_files/table_functions.slt b/datafusion/sqllogictest/test_files/table_functions.slt index 0159abe8d06b..484004c14e03 100644 --- a/datafusion/sqllogictest/test_files/table_functions.slt +++ b/datafusion/sqllogictest/test_files/table_functions.slt @@ -353,8 +353,8 @@ SELECT * FROM generate_series(TIMESTAMP '2023-01-01T00:00:00', TIMESTAMP '2023-0 query P SELECT * FROM range(TIMESTAMP '2023-01-01T00:00:00+00:00', TIMESTAMP '2023-01-03T00:00:00+00:00', INTERVAL '1' DAY) ---- -2023-01-01T00:00:00 -2023-01-02T00:00:00 +2023-01-01T00:00:00Z +2023-01-02T00:00:00Z # Negative timestamp range (going backwards) query P From 1b7f8f59cb421ea03ca8d8922e01cc19f15cb520 Mon Sep 17 00:00:00 2001 From: sriram Date: Sat, 1 Nov 2025 23:41:47 +0530 Subject: [PATCH 004/157] cargo fmt --- .../functions/src/datetime/date_part.rs | 7 ++-- datafusion/sql/src/expr/mod.rs | 36 ++++++++++--------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 4754589ad19d..dc9a1d7b5ae1 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -27,7 +27,10 @@ use arrow::datatypes::DataType::{ Date32, Date64, Duration, Interval, Time32, Time64, Timestamp, }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; -use arrow::datatypes::{ArrowTimestampType, DataType, Field, FieldRef, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType}; +use arrow::datatypes::{ + ArrowTimestampType, DataType, Field, FieldRef, TimeUnit, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, +}; use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; use datafusion_common::cast::as_primitive_array; use datafusion_common::types::{logical_date, NativeType}; @@ -281,8 +284,6 @@ impl ScalarUDFImpl for DatePartFunc { } }; - - Ok(if is_scalar { ColumnarValue::Scalar(ScalarValue::try_from_array(arr.as_ref(), 0)?) 
} else { diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 350f65019c0d..5423966bb0b3 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -16,7 +16,6 @@ // under the License. use arrow::datatypes::{DataType, TimeUnit}; -use std::sync::Arc; use datafusion_expr::planner::{ PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr, }; @@ -25,6 +24,7 @@ use sqlparser::ast::{ DictionaryField, Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry, StructField, Subscript, TrimWhereField, TypedString, Value, ValueWithSpan, }; +use std::sync::Arc; use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result, @@ -295,17 +295,19 @@ impl SqlToRel<'_, S> { } SQLExpr::TypedString(TypedString { - data_type, - value, - uses_odbc_syntax: _, - }) => { + data_type, + value, + uses_odbc_syntax: _, + }) => { let string_value = value.into_string().unwrap(); - let mut cast_data_type = self.convert_data_type_to_field(&data_type)? + let mut cast_data_type = self + .convert_data_type_to_field(&data_type)? .data_type() .clone(); if let DataType::Timestamp(time_unit, None) = &cast_data_type { if let Some(tz) = extract_tz_from_string(&string_value) { - cast_data_type = DataType::Timestamp(*time_unit, Some(Arc::from(tz))); + cast_data_type = + DataType::Timestamp(*time_unit, Some(Arc::from(tz))); } } Ok(Expr::Cast(Cast::new( @@ -564,9 +566,9 @@ impl SqlToRel<'_, S> { )?), match *time_zone { SQLExpr::Value(ValueWithSpan { - value: Value::SingleQuotedString(s), - span: _, - }) => DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())), + value: Value::SingleQuotedString(s), + span: _, + }) => DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())), _ => { return not_impl_err!( "Unsupported ast node in sqltorel: {time_zone:?}" @@ -990,13 +992,13 @@ impl SqlToRel<'_, S> { // to align with postgres / duckdb semantics let expr = match dt.data_type() { DataType::Timestamp(TimeUnit::Nanosecond, tz) - if expr.get_type(schema)? == DataType::Int64 => - { - Expr::Cast(Cast::new( - Box::new(expr), - DataType::Timestamp(TimeUnit::Second, tz.clone()), - )) - } + if expr.get_type(schema)? == DataType::Int64 => + { + Expr::Cast(Cast::new( + Box::new(expr), + DataType::Timestamp(TimeUnit::Second, tz.clone()), + )) + } _ => expr, }; From 924e33f11ad65b5db7601a76caee4b1d56e35580 Mon Sep 17 00:00:00 2001 From: sriram Date: Sat, 1 Nov 2025 23:46:12 +0530 Subject: [PATCH 005/157] add extract.rs --- datafusion/functions/src/datetime/extract.rs | 527 +++++++++++++++++++ 1 file changed, 527 insertions(+) create mode 100644 datafusion/functions/src/datetime/extract.rs diff --git a/datafusion/functions/src/datetime/extract.rs b/datafusion/functions/src/datetime/extract.rs new file mode 100644 index 000000000000..ccea202a0b92 --- /dev/null +++ b/datafusion/functions/src/datetime/extract.rs @@ -0,0 +1,527 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::str::FromStr; +use std::sync::Arc; + +use arrow::array::timezone::Tz; +use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, PrimitiveBuilder}; +use arrow::compute::kernels::cast_utils::IntervalUnit; +use arrow::compute::{binary, date_part, DatePart}; +use arrow::datatypes::DataType::{ + Date32, Date64, Duration, Interval, Time32, Time64, Timestamp, +}; +use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; +use arrow::datatypes::{ + ArrowTimestampType, DataType, Field, FieldRef, Int32Type, TimeUnit, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, +}; +use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; +use datafusion_common::cast::as_primitive_array; +use datafusion_common::types::{logical_date, NativeType}; +use std::ops::Add; + +use datafusion_common::{ + cast::{ + as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array, + as_time32_second_array, as_time64_microsecond_array, as_time64_nanosecond_array, + as_timestamp_microsecond_array, as_timestamp_millisecond_array, + as_timestamp_nanosecond_array, as_timestamp_second_array, + }, + exec_err, internal_datafusion_err, internal_err, not_impl_err, + types::logical_string, + utils::take_function_args, + Result, ScalarValue, +}; +use datafusion_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, +}; +use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; +use datafusion_macros::user_doc; + +#[user_doc( + doc_section(label = "Time and Date Functions"), + description = "Returns the specified part of the date as an integer.", + syntax_example = "extract(field FROM source)", + argument( + name = "field", + description = r#"Part of the date to return. The following date parts are supported: + +- year +- quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in) +- month +- week (week of the year) +- day (day of the month) +- hour +- minute +- second +- millisecond +- microsecond +- nanosecond +- dow (day of the week where Sunday is 0) +- doy (day of the year) +- epoch (seconds since Unix epoch) +- isodow (day of the week where Monday is 0) +"# + ), + argument( + name = "source", + description = "Time expression to operate on. Can be a constant, column, or function." 
+ ) +)] +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct ExtractFunc { + signature: Signature, +} + +impl Default for ExtractFunc { + fn default() -> Self { + Self::new() + } +} + +impl ExtractFunc { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_implicit( + TypeSignatureClass::Timestamp, + // Not consistent with Postgres and DuckDB but to avoid regression we implicit cast string to timestamp + vec![TypeSignatureClass::Native(logical_string())], + NativeType::Timestamp(Nanosecond, None), + ), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_date())), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Time), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Interval), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Duration), + ]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for ExtractFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "extract" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + internal_err!("return_field_from_args should be called instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let [field, _] = take_function_args(self.name(), args.scalar_arguments)?; + + field + .and_then(|sv| { + sv.try_as_str() + .flatten() + .filter(|s| !s.is_empty()) + .map(|part| { + if is_epoch(part) { + Field::new(self.name(), DataType::Float64, true) + } else { + Field::new(self.name(), DataType::Int32, true) + } + }) + }) + .map(Arc::new) + .map_or_else( + || exec_err!("{} requires non-empty constant string", self.name()), + Ok, + ) + } + + fn invoke_with_args( + &self, + args: datafusion_expr::ScalarFunctionArgs, + ) -> Result { + let config = &args.config_options; + let args = args.args; + let [part, array] = take_function_args(self.name(), args)?; + + let part = if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) = part { + v + } else if let ColumnarValue::Scalar(ScalarValue::Utf8View(Some(v))) = part { + v + } else { + return exec_err!("First argument of `EXTRACT` must be non-null scalar Utf8"); + }; + + let is_scalar = matches!(array, ColumnarValue::Scalar(_)); + + let array = match array { + ColumnarValue::Array(array) => Arc::clone(&array), + ColumnarValue::Scalar(scalar) => scalar.to_array()?, + }; + + let (is_timezone_aware, tz_str_opt) = match array.data_type() { + Timestamp(_, Some(tz_str)) => (true, Some(tz_str.clone())), + _ => (false, None), + }; + + // Adjust timestamps for extraction + let array = if is_timezone_aware { + // For timezone-aware timestamps, extract in their own timezone + let tz_str = tz_str_opt.as_ref().unwrap(); + let tz = match tz_str.parse::() { + Ok(tz) => tz, + Err(_) => return exec_err!("Invalid timezone"), + }; + match array.data_type() { + Timestamp(time_unit, _) => match time_unit { + Nanosecond => { + adjust_timestamp_array::(&array, tz)? + } + Microsecond => { + adjust_timestamp_array::(&array, tz)? 
+ } + Millisecond => { + adjust_timestamp_array::(&array, tz)? + } + Second => adjust_timestamp_array::(&array, tz)?, + _ => array, + }, + _ => array, + } + } else if let Timestamp(time_unit, None) = array.data_type() { + // For naive timestamps, interpret in session timezone + let tz = match config.execution.time_zone.parse::() { + Ok(tz) => tz, + Err(_) => return exec_err!("Invalid timezone"), + }; + match time_unit { + Nanosecond => { + adjust_timestamp_array::(&array, tz)? + } + Microsecond => { + adjust_timestamp_array::(&array, tz)? + } + Millisecond => { + adjust_timestamp_array::(&array, tz)? + } + Second => adjust_timestamp_array::(&array, tz)?, + _ => array, + } + } else { + array + }; + + let part_trim = part_normalization(&part); + + // using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds") + // and synonyms ( like "ms,msec,msecond,millisecond") to Arrow + let mut arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { + match interval_unit { + IntervalUnit::Year => date_part(array.as_ref(), DatePart::Year)?, + IntervalUnit::Month => date_part(array.as_ref(), DatePart::Month)?, + IntervalUnit::Week => date_part(array.as_ref(), DatePart::Week)?, + IntervalUnit::Day => date_part(array.as_ref(), DatePart::Day)?, + IntervalUnit::Hour => date_part(array.as_ref(), DatePart::Hour)?, + IntervalUnit::Minute => date_part(array.as_ref(), DatePart::Minute)?, + IntervalUnit::Second => seconds_as_i32(array.as_ref(), Second)?, + IntervalUnit::Millisecond => seconds_as_i32(array.as_ref(), Millisecond)?, + IntervalUnit::Microsecond => seconds_as_i32(array.as_ref(), Microsecond)?, + IntervalUnit::Nanosecond => seconds_as_i32(array.as_ref(), Nanosecond)?, + // century and decade are not supported by `DatePart`, although they are supported in postgres + _ => return exec_err!("Date part '{part}' not supported"), + } + } else { + // special cases that can be extracted (in postgres) but are not interval units + match part_trim.to_lowercase().as_str() { + "qtr" | "quarter" => date_part(array.as_ref(), DatePart::Quarter)?, + "doy" => date_part(array.as_ref(), DatePart::DayOfYear)?, + "dow" => date_part(array.as_ref(), DatePart::DayOfWeekSunday0)?, + "isodow" => date_part(array.as_ref(), DatePart::DayOfWeekMonday0)?, + "epoch" => epoch(array.as_ref())?, + _ => return exec_err!("Date part '{part}' not supported"), + } + }; + + // Special adjustment for hour extraction on timezone-aware timestamps + if is_timezone_aware && part_trim.to_lowercase() == "hour" { + if let Some(tz_str) = &tz_str_opt { + let offset_hours = if tz_str.as_ref() == "+00:00" { + 0 + } else { + let sign = if tz_str.starts_with('+') { 1i32 } else { -1i32 }; + let hours_str = &tz_str[1..3]; + let hours: i32 = hours_str.parse().unwrap(); + sign * hours + }; + let int_arr = as_int32_array(&arr)?; + let mut builder = PrimitiveBuilder::::new(); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + let v = int_arr.value(i); + builder.append_value(v + offset_hours); + } + } + arr = Arc::new(builder.finish()); + } + } + + Ok(if is_scalar { + ColumnarValue::Scalar(ScalarValue::try_from_array(arr.as_ref(), 0)?) 
+ } else { + ColumnarValue::Array(arr) + }) + } + + fn aliases(&self) -> &[String] { + &[] + } + + fn documentation(&self) -> Option<&Documentation> { + self.doc() + } +} + +fn adjust_to_local_time(ts: i64, tz: Tz) -> Result { + fn convert_timestamp(ts: i64, converter: F) -> Result> + where + F: Fn(i64) -> MappedLocalTime>, + { + match converter(ts) { + MappedLocalTime::Ambiguous(earliest, latest) => exec_err!( + "Ambiguous timestamp. Do you mean {:?} or {:?}", + earliest, + latest + ), + MappedLocalTime::None => exec_err!( + "The local time does not exist because there is a gap in the local time." + ), + MappedLocalTime::Single(date_time) => Ok(date_time), + } + } + + let date_time = match T::UNIT { + Nanosecond => Utc.timestamp_nanos(ts), + Microsecond => convert_timestamp(ts, |ts| Utc.timestamp_micros(ts))?, + Millisecond => convert_timestamp(ts, |ts| Utc.timestamp_millis_opt(ts))?, + Second => convert_timestamp(ts, |ts| Utc.timestamp_opt(ts, 0))?, + }; + + let offset_seconds: i64 = tz + .offset_from_utc_datetime(&date_time.naive_utc()) + .fix() + .local_minus_utc() as i64; + + let adjusted_date_time = date_time.add( + TimeDelta::try_seconds(offset_seconds) + .ok_or_else(|| internal_datafusion_err!("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000"))?, + ); + + // convert back to i64 + match T::UNIT { + Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or_else(|| { + internal_datafusion_err!( + "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807" + ) + }), + Microsecond => Ok(adjusted_date_time.timestamp_micros()), + Millisecond => Ok(adjusted_date_time.timestamp_millis()), + Second => Ok(adjusted_date_time.timestamp()), + } +} + +fn adjust_timestamp_array( + array: &ArrayRef, + tz: Tz, +) -> Result { + let mut builder = PrimitiveBuilder::::new(); + let primitive_array = as_primitive_array::(array)?; + for ts_opt in primitive_array.iter() { + match ts_opt { + None => builder.append_null(), + Some(ts) => { + let adjusted_ts = adjust_to_local_time::(ts, tz)?; + builder.append_value(adjusted_ts); + } + } + } + Ok(Arc::new(builder.finish())) +} + +fn is_epoch(part: &str) -> bool { + let part = part_normalization(part); + matches!(part.to_lowercase().as_str(), "epoch") +} + +// Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error +fn part_normalization(part: &str) -> &str { + part.strip_prefix(|c| c == '\'' || c == '\"') + .and_then(|s| s.strip_suffix(|c| c == '\'' || c == '\"')) + .unwrap_or(part) +} + +/// Invoke [`date_part`] on an `array` (e.g. 
Timestamp) and convert the +/// result to a total number of seconds, milliseconds, microseconds or +/// nanoseconds +fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result { + // Nanosecond is neither supported in Postgres nor DuckDB, to avoid dealing + // with overflow and precision issue we don't support nanosecond + if unit == Nanosecond { + return not_impl_err!("Date part {unit:?} not supported"); + } + + let conversion_factor = match unit { + Second => 1_000_000_000, + Millisecond => 1_000_000, + Microsecond => 1_000, + Nanosecond => 1, + }; + + let second_factor = match unit { + Second => 1, + Millisecond => 1_000, + Microsecond => 1_000_000, + Nanosecond => 1_000_000_000, + }; + + let secs = date_part(array, DatePart::Second)?; + // This assumes array is primitive and not a dictionary + let secs = as_int32_array(secs.as_ref())?; + let subsecs = date_part(array, DatePart::Nanosecond)?; + let subsecs = as_int32_array(subsecs.as_ref())?; + + // Special case where there are no nulls. + if subsecs.null_count() == 0 { + let r: Int32Array = binary(secs, subsecs, |secs, subsecs| { + secs * second_factor + (subsecs % 1_000_000_000) / conversion_factor + })?; + Ok(Arc::new(r)) + } else { + // Nulls in secs are preserved, nulls in subsecs are treated as zero to account for the case + // where the number of nanoseconds overflows. + let r: Int32Array = secs + .iter() + .zip(subsecs) + .map(|(secs, subsecs)| { + secs.map(|secs| { + let subsecs = subsecs.unwrap_or(0); + secs * second_factor + (subsecs % 1_000_000_000) / conversion_factor + }) + }) + .collect(); + Ok(Arc::new(r)) + } +} + +/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the +/// result to a total number of seconds, milliseconds, microseconds or +/// nanoseconds +/// +/// Given epoch return f64, this is a duplicated function to optimize for f64 type +fn seconds(array: &dyn Array, unit: TimeUnit) -> Result { + let sf = match unit { + Second => 1_f64, + Millisecond => 1_000_f64, + Microsecond => 1_000_000_f64, + Nanosecond => 1_000_000_000_f64, + }; + let secs = date_part(array, DatePart::Second)?; + // This assumes array is primitive and not a dictionary + let secs = as_int32_array(secs.as_ref())?; + let subsecs = date_part(array, DatePart::Nanosecond)?; + let subsecs = as_int32_array(subsecs.as_ref())?; + + // Special case where there are no nulls. + if subsecs.null_count() == 0 { + let r: Float64Array = binary(secs, subsecs, |secs, subsecs| { + (secs as f64 + ((subsecs % 1_000_000_000) as f64 / 1_000_000_000_f64)) * sf + })?; + Ok(Arc::new(r)) + } else { + // Nulls in secs are preserved, nulls in subsecs are treated as zero to account for the case + // where the number of nanoseconds overflows. 
+ let r: Float64Array = secs + .iter() + .zip(subsecs) + .map(|(secs, subsecs)| { + secs.map(|secs| { + let subsecs = subsecs.unwrap_or(0); + (secs as f64 + ((subsecs % 1_000_000_000) as f64 / 1_000_000_000_f64)) + * sf + }) + }) + .collect(); + Ok(Arc::new(r)) + } +} + +fn epoch(array: &dyn Array) -> Result { + const SECONDS_IN_A_DAY: f64 = 86400_f64; + + let f: Float64Array = match array.data_type() { + Timestamp(Second, _) => as_timestamp_second_array(array)?.unary(|x| x as f64), + Timestamp(Millisecond, _) => { + as_timestamp_millisecond_array(array)?.unary(|x| x as f64 / 1_000_f64) + } + Timestamp(Microsecond, _) => { + as_timestamp_microsecond_array(array)?.unary(|x| x as f64 / 1_000_000_f64) + } + Timestamp(Nanosecond, _) => { + as_timestamp_nanosecond_array(array)?.unary(|x| x as f64 / 1_000_000_000_f64) + } + Date32 => as_date32_array(array)?.unary(|x| x as f64 * SECONDS_IN_A_DAY), + Date64 => as_date64_array(array)?.unary(|x| x as f64 / 1_000_f64), + Time32(Second) => as_time32_second_array(array)?.unary(|x| x as f64), + Time32(Millisecond) => { + as_time32_millisecond_array(array)?.unary(|x| x as f64 / 1_000_f64) + } + Time64(Microsecond) => { + as_time64_microsecond_array(array)?.unary(|x| x as f64 / 1_000_000_f64) + } + Time64(Nanosecond) => { + as_time64_nanosecond_array(array)?.unary(|x| x as f64 / 1_000_000_000_f64) + } + Interval(_) | Duration(_) => return seconds(array, Second), + d => return exec_err!("Cannot convert {d:?} to epoch"), + }; + Ok(Arc::new(f)) +} From d3043dc7b2c3ad58c198c137490f14977dc008e3 Mon Sep 17 00:00:00 2001 From: sriram Date: Sun, 2 Nov 2025 00:27:17 +0530 Subject: [PATCH 006/157] CI fixes. --- .../functions/src/datetime/date_part.rs | 6 ++-- datafusion/functions/src/datetime/extract.rs | 4 +-- datafusion/sql/src/expr/mod.rs | 2 +- .../test_files/tpch/plans/q7.slt.part | 4 +-- .../test_files/tpch/plans/q8.slt.part | 4 +-- .../test_files/tpch/plans/q9.slt.part | 4 +-- .../source/user-guide/sql/scalar_functions.md | 31 +++++++++++++++++++ 7 files changed, 41 insertions(+), 14 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index dc9a1d7b5ae1..73efac612374 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -202,7 +202,7 @@ impl ScalarUDFImpl for DatePartFunc { }; let (is_timezone_aware, tz_str_opt) = match array.data_type() { - Timestamp(_, Some(tz_str)) => (true, Some(tz_str.clone())), + Timestamp(_, Some(tz_str)) => (true, Some(Arc::clone(tz_str))), _ => (false, None), }; @@ -226,7 +226,6 @@ impl ScalarUDFImpl for DatePartFunc { adjust_timestamp_array::(&array, tz)? } Second => adjust_timestamp_array::(&array, tz)?, - _ => array, }, _ => array, } @@ -247,7 +246,6 @@ impl ScalarUDFImpl for DatePartFunc { adjust_timestamp_array::(&array, tz)? 
} Second => adjust_timestamp_array::(&array, tz)?, - _ => array, } } else { array @@ -257,7 +255,7 @@ impl ScalarUDFImpl for DatePartFunc { // using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds") // and synonyms ( like "ms,msec,msecond,millisecond") to Arrow - let mut arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { + let arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { match interval_unit { IntervalUnit::Year => date_part(array.as_ref(), DatePart::Year)?, IntervalUnit::Month => date_part(array.as_ref(), DatePart::Month)?, diff --git a/datafusion/functions/src/datetime/extract.rs b/datafusion/functions/src/datetime/extract.rs index ccea202a0b92..bf495e259b7e 100644 --- a/datafusion/functions/src/datetime/extract.rs +++ b/datafusion/functions/src/datetime/extract.rs @@ -198,7 +198,7 @@ impl ScalarUDFImpl for ExtractFunc { }; let (is_timezone_aware, tz_str_opt) = match array.data_type() { - Timestamp(_, Some(tz_str)) => (true, Some(tz_str.clone())), + Timestamp(_, Some(tz_str)) => (true, Some(Arc::clone(tz_str))), _ => (false, None), }; @@ -222,7 +222,6 @@ impl ScalarUDFImpl for ExtractFunc { adjust_timestamp_array::(&array, tz)? } Second => adjust_timestamp_array::(&array, tz)?, - _ => array, }, _ => array, } @@ -243,7 +242,6 @@ impl ScalarUDFImpl for ExtractFunc { adjust_timestamp_array::(&array, tz)? } Second => adjust_timestamp_array::(&array, tz)?, - _ => array, } } else { array diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 5423966bb0b3..a016f28db417 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -1193,7 +1193,7 @@ impl SqlToRel<'_, S> { } fn extract_tz_from_string(s: &str) -> Option { - if let Some(pos) = s.rfind(|c| c == '+' || c == '-') { + if let Some(pos) = s.rfind(|c| ['+', '-'].contains(&c)) { let tz_str = &s[pos..]; if tz_str.len() == 6 && tz_str.chars().nth(3) == Some(':') { Some(tz_str.to_string()) diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part index 291d56e43f2d..12b06bb485fb 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part @@ -62,7 +62,7 @@ logical_plan 02)--Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, sum(shipping.volume) AS revenue 03)----Aggregate: groupBy=[[shipping.supp_nation, shipping.cust_nation, shipping.l_year]], aggr=[[sum(shipping.volume)]] 04)------SubqueryAlias: shipping -05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume +05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, extract(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume 06)----------Inner Join: customer.c_nationkey = n2.n_nationkey Filter: n1.n_name = Utf8View("FRANCE") AND n2.n_name = Utf8View("GERMANY") OR n1.n_name = Utf8View("GERMANY") AND n2.n_name = Utf8View("FRANCE") 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey, n1.n_name 08)--------------Inner Join: supplier.s_nationkey = n1.n_nationkey @@ -91,7 +91,7 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: 
partitioning=Hash([supp_nation@0, cust_nation@1, l_year@2], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] -08)--------------ProjectionExec: expr=[n_name@3 as supp_nation, n_name@4 as cust_nation, date_part(YEAR, l_shipdate@2) as l_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume] +08)--------------ProjectionExec: expr=[n_name@3 as supp_nation, n_name@4 as cust_nation, extract(YEAR, l_shipdate@2) as l_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume] 09)----------------CoalesceBatchesExec: target_batch_size=8192 10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], filter=n_name@0 = FRANCE AND n_name@1 = GERMANY OR n_name@0 = GERMANY AND n_name@1 = FRANCE, projection=[l_extendedprice@0, l_discount@1, l_shipdate@2, n_name@4, n_name@6] 11)--------------------CoalesceBatchesExec: target_batch_size=8192 diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part index 50171c528db6..b10e2ddc9ce1 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part @@ -60,7 +60,7 @@ logical_plan 02)--Projection: all_nations.o_year, CAST(CAST(sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END) AS Decimal128(12, 2)) / CAST(sum(all_nations.volume) AS Decimal128(12, 2)) AS Decimal128(15, 2)) AS mkt_share 03)----Aggregate: groupBy=[[all_nations.o_year]], aggr=[[sum(CASE WHEN all_nations.nation = Utf8View("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]] 04)------SubqueryAlias: all_nations -05)--------Projection: date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation +05)--------Projection: extract(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) 06)----------Inner Join: n1.n_regionkey = region.r_regionkey 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name 08)--------------Inner Join: supplier.s_nationkey = n2.n_nationkey @@ -97,7 +97,7 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([o_year@0], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)] -08)--------------ProjectionExec: expr=[date_part(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume, n_name@3 as nation] +08)--------------ProjectionExec: expr=[extract(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) 09)----------------CoalesceBatchesExec: target_batch_size=8192 10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@3, r_regionkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@2, n_name@4] 11)--------------------CoalesceBatchesExec: target_batch_size=8192 diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part 
b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part index 3b31c1bc2e8e..611a05e7371e 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part @@ -56,7 +56,7 @@ logical_plan 02)--Projection: profit.nation, profit.o_year, sum(profit.amount) AS sum_profit 03)----Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[sum(profit.amount)]] 04)------SubqueryAlias: profit -05)--------Projection: nation.n_name AS nation, date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount +05)--------Projection: nation.n_name AS nation, extract(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount 06)----------Inner Join: supplier.s_nationkey = nation.n_nationkey 07)------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate 08)--------------Inner Join: lineitem.l_orderkey = orders.o_orderkey @@ -82,7 +82,7 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([nation@0, o_year@1], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)] -08)--------------ProjectionExec: expr=[n_name@5 as nation, date_part(YEAR, o_orderdate@4) as o_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) - ps_supplycost@3 * l_quantity@0 as amount] +08)--------------ProjectionExec: expr=[n_name@5 as nation, extract(YEAR, o_orderdate@4) as o_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) - ps_supplycost@3 * l_quantity@0 as amount] 09)----------------CoalesceBatchesExec: target_batch_size=8192 10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)], projection=[l_quantity@0, l_extendedprice@1, l_discount@2, ps_supplycost@4, o_orderdate@5, n_name@7] 11)--------------------CoalesceBatchesExec: target_batch_size=8192 diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index d2e7066191f9..30e10a84fd8e 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2387,6 +2387,7 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo - [date_trunc](#date_trunc) - [datepart](#datepart) - [datetrunc](#datetrunc) +- [extract](#extract) - [from_unixtime](#from_unixtime) - [make_date](#make_date) - [now](#now) @@ -2570,6 +2571,36 @@ _Alias of [date_part](#date_part)._ _Alias of [date_trunc](#date_trunc)._ +### `extract` + +Returns the specified part of the date as an integer. + +```sql +extract(field FROM source) +``` + +#### Arguments + +- **field**: Part of the date to return. The following date parts are supported: + +- year +- quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in) +- month +- week (week of the year) +- day (day of the month) +- hour +- minute +- second +- millisecond +- microsecond +- nanosecond +- dow (day of the week where Sunday is 0) +- doy (day of the year) +- epoch (seconds since Unix epoch) +- isodow (day of the week where Monday is 0) + +- **source**: Time expression to operate on. 
Can be a constant, column, or function. + ### `from_unixtime` Converts an integer to RFC3339 timestamp format (`YYYY-MM-DDT00:00:00.000000000Z`). Integers and unsigned integers are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`) return the corresponding timestamp. From 5f2f5d7b79a3e8a773f591d4e27fc19227b4f226 Mon Sep 17 00:00:00 2001 From: sriram Date: Sun, 2 Nov 2025 00:55:40 +0530 Subject: [PATCH 007/157] CI fixes. --- datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part index b10e2ddc9ce1..a500f89f5f4b 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part @@ -60,7 +60,7 @@ logical_plan 02)--Projection: all_nations.o_year, CAST(CAST(sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END) AS Decimal128(12, 2)) / CAST(sum(all_nations.volume) AS Decimal128(12, 2)) AS Decimal128(15, 2)) AS mkt_share 03)----Aggregate: groupBy=[[all_nations.o_year]], aggr=[[sum(CASE WHEN all_nations.nation = Utf8View("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]] 04)------SubqueryAlias: all_nations -05)--------Projection: extract(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) +05)--------Projection: extract(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation 06)----------Inner Join: n1.n_regionkey = region.r_regionkey 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name 08)--------------Inner Join: supplier.s_nationkey = n2.n_nationkey @@ -97,7 +97,7 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([o_year@0], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)] -08)--------------ProjectionExec: expr=[extract(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) +08)--------------ProjectionExec: expr=[extract(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume, n_name@3 as nation] 09)----------------CoalesceBatchesExec: target_batch_size=8192 10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_regionkey@3, r_regionkey@0)], projection=[l_extendedprice@0, l_discount@1, o_orderdate@2, n_name@4] 11)--------------------CoalesceBatchesExec: target_batch_size=8192 From be7385c0747ebe7cdad5de353cbab18d48e21994 Mon Sep 17 00:00:00 2001 From: sriram Date: Mon, 10 Nov 2025 16:13:03 +0530 Subject: [PATCH 008/157] Make extract timezone aware and part of the date part udf itself --- .../functions/src/datetime/date_part.rs | 211 ++++--- datafusion/functions/src/datetime/extract.rs | 525 ------------------ datafusion/functions/src/datetime/mod.rs | 59 +- datafusion/functions/src/datetime/planner.rs | 2 +- .../functions/src/datetime/to_local_time.rs | 56 +- .../sqllogictest/test_files/extract_tz.slt | 19 + 
.../sqllogictest/test_files/group_by.slt | 12 +- .../optimizer_group_by_constant.slt | 2 +- .../test_files/tpch/plans/q7.slt.part | 4 +- .../test_files/tpch/plans/q8.slt.part | 2 +- .../test_files/tpch/plans/q9.slt.part | 4 +- 11 files changed, 231 insertions(+), 665 deletions(-) delete mode 100644 datafusion/functions/src/datetime/extract.rs diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 73efac612374..4a387a0d1641 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -31,11 +31,11 @@ use arrow::datatypes::{ ArrowTimestampType, DataType, Field, FieldRef, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; + use datafusion_common::cast::as_primitive_array; use datafusion_common::types::{logical_date, NativeType}; -use std::ops::Add; +use super::adjust_to_local_time; use datafusion_common::{ cast::{ as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array, @@ -43,7 +43,7 @@ use datafusion_common::{ as_timestamp_microsecond_array, as_timestamp_millisecond_array, as_timestamp_nanosecond_array, as_timestamp_second_array, }, - exec_err, internal_datafusion_err, internal_err, not_impl_err, + exec_err, internal_err, not_impl_err, types::logical_string, utils::take_function_args, Result, ScalarValue, @@ -131,7 +131,7 @@ impl DatePartFunc { ], Volatility::Immutable, ), - aliases: vec![String::from("datepart")], + aliases: vec![String::from("datepart"), String::from("extract")], } } } @@ -206,8 +206,13 @@ impl ScalarUDFImpl for DatePartFunc { _ => (false, None), }; - // Adjust timestamps for extraction - let array = if is_timezone_aware { + let part_trim = part_normalization(&part); + let is_epoch = is_epoch(&part); + + // Epoch is timezone-independent - it always returns seconds since 1970-01-01 UTC + let array = if is_epoch { + array + } else if is_timezone_aware { // For timezone-aware timestamps, extract in their own timezone let tz_str = tz_str_opt.as_ref().unwrap(); let tz = match tz_str.parse::() { @@ -251,12 +256,10 @@ impl ScalarUDFImpl for DatePartFunc { array }; - let part_trim = part_normalization(&part); - // using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds") // and synonyms ( like "ms,msec,msecond,millisecond") to Arrow let arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { - match interval_unit { + let extracted = match interval_unit { IntervalUnit::Year => date_part(array.as_ref(), DatePart::Year)?, IntervalUnit::Month => date_part(array.as_ref(), DatePart::Month)?, IntervalUnit::Week => date_part(array.as_ref(), DatePart::Week)?, @@ -267,8 +270,39 @@ impl ScalarUDFImpl for DatePartFunc { IntervalUnit::Millisecond => seconds_as_i32(array.as_ref(), Millisecond)?, IntervalUnit::Microsecond => seconds_as_i32(array.as_ref(), Microsecond)?, IntervalUnit::Nanosecond => seconds_as_i32(array.as_ref(), Nanosecond)?, - // century and decade are not supported by `DatePart`, although they are supported in postgres _ => return exec_err!("Date part '{part}' not supported"), + }; + + // For fixed offsets (like +04:00, -05:30), apply the offset to extract values. + // Named timezones (like 'America/New_York') are handled by adjust_to_local_time + // and DST is already applied via chrono. 
+ if is_timezone_aware { + let tz_str = tz_str_opt.as_ref().unwrap().as_ref(); + if is_fixed_offset(tz_str) { + if let Some(offset_info) = extract_offset_components(tz_str) { + match interval_unit { + IntervalUnit::Hour => apply_hour_offset( + extracted.as_ref(), + offset_info.hours, + offset_info.minutes, + )?, + IntervalUnit::Minute => apply_minute_offset( + extracted.as_ref(), + offset_info.minutes, + )?, + IntervalUnit::Day => { + apply_day_offset(extracted.as_ref(), offset_info.hours)? + } + _ => extracted, + } + } else { + extracted + } + } else { + extracted + } + } else { + extracted } } else { // special cases that can be extracted (in postgres) but are not interval units @@ -298,54 +332,6 @@ impl ScalarUDFImpl for DatePartFunc { } } -fn adjust_to_local_time(ts: i64, tz: Tz) -> Result { - fn convert_timestamp(ts: i64, converter: F) -> Result> - where - F: Fn(i64) -> MappedLocalTime>, - { - match converter(ts) { - MappedLocalTime::Ambiguous(earliest, latest) => exec_err!( - "Ambiguous timestamp. Do you mean {:?} or {:?}", - earliest, - latest - ), - MappedLocalTime::None => exec_err!( - "The local time does not exist because there is a gap in the local time." - ), - MappedLocalTime::Single(date_time) => Ok(date_time), - } - } - - let date_time = match T::UNIT { - Nanosecond => Utc.timestamp_nanos(ts), - Microsecond => convert_timestamp(ts, |ts| Utc.timestamp_micros(ts))?, - Millisecond => convert_timestamp(ts, |ts| Utc.timestamp_millis_opt(ts))?, - Second => convert_timestamp(ts, |ts| Utc.timestamp_opt(ts, 0))?, - }; - - let offset_seconds: i64 = tz - .offset_from_utc_datetime(&date_time.naive_utc()) - .fix() - .local_minus_utc() as i64; - - let adjusted_date_time = date_time.add( - TimeDelta::try_seconds(offset_seconds) - .ok_or_else(|| internal_datafusion_err!("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000"))?, - ); - - // convert back to i64 - match T::UNIT { - Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or_else(|| { - internal_datafusion_err!( - "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807" - ) - }), - Microsecond => Ok(adjusted_date_time.timestamp_micros()), - Millisecond => Ok(adjusted_date_time.timestamp_millis()), - Second => Ok(adjusted_date_time.timestamp()), - } -} - fn adjust_timestamp_array( array: &ArrayRef, tz: Tz, @@ -369,18 +355,108 @@ fn is_epoch(part: &str) -> bool { matches!(part.to_lowercase().as_str(), "epoch") } -// Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error +// Check if a timezone string is a fixed offset +fn is_fixed_offset(tz_str: &str) -> bool { + tz_str.starts_with('+') || tz_str.starts_with('-') +} + +// Holds the components of a timezone offset (hours and minutes). +struct OffsetInfo { + hours: i32, + minutes: i32, +} + +// Extracts the offset components from a timezone string like "+04:00" or "-05:30". +fn extract_offset_components(tz_str: &str) -> Option { + if tz_str.len() < 6 { + return None; + } + + let sign = match &tz_str[0..1] { + "+" => 1, + "-" => -1, + _ => return None, + }; + + let hours: i32 = tz_str[1..3].parse().ok()?; + let minutes: i32 = tz_str[4..6].parse().ok()?; + + Some(OffsetInfo { + hours: sign * hours, + minutes: sign * minutes, + }) +} + +// Applies the timezone offset to hour values in an array. 
+fn apply_hour_offset( + array: &dyn Array, + offset_hours: i32, + offset_minutes: i32, +) -> Result { + let hour_array = as_int32_array(array)?; + let result: Int32Array = hour_array + .iter() + .map(|hour| { + hour.map(|h| { + let mut adjusted = h + offset_hours; + if offset_minutes.abs() >= 30 { + adjusted += if offset_minutes > 0 { 1 } else { -1 }; + } + ((adjusted % 24) + 24) % 24 + }) + }) + .collect(); + Ok(Arc::new(result)) +} + +// Applies the timezone offset to minute values in an array. +fn apply_minute_offset(array: &dyn Array, offset_minutes: i32) -> Result { + let minute_array = as_int32_array(array)?; + let result: Int32Array = minute_array + .iter() + .map(|minute| { + minute.map(|m| { + let adjusted = m + offset_minutes; + ((adjusted % 60) + 60) % 60 + }) + }) + .collect(); + Ok(Arc::new(result)) +} + +// Applies the timezone offset to day values in an array. +fn apply_day_offset(array: &dyn Array, offset_hours: i32) -> Result { + let day_array = as_int32_array(array)?; + let result: Int32Array = day_array + .iter() + .map(|day| { + day.map(|d| { + if offset_hours >= 24 { + d + (offset_hours / 24) + } else if offset_hours <= -24 { + d + (offset_hours / 24) + } else if offset_hours > 0 { + d + 1 + } else if offset_hours < 0 { + d - 1 + } else { + d + } + }) + }) + .collect(); + Ok(Arc::new(result)) +} + +// Try to remove quotes if they exist. If the quotes are invalid, return original string. fn part_normalization(part: &str) -> &str { part.strip_prefix(|c| c == '\'' || c == '\"') .and_then(|s| s.strip_suffix(|c| c == '\'' || c == '\"')) .unwrap_or(part) } -/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the -/// result to a total number of seconds, milliseconds, microseconds or -/// nanoseconds +// Converts seconds to i32 with the specified time unit. fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result { - // Nanosecond is neither supported in Postgres nor DuckDB, to avoid dealing // with overflow and precision issue we don't support nanosecond if unit == Nanosecond { return not_impl_err!("Date part {unit:?} not supported"); @@ -401,7 +477,6 @@ fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result { }; let secs = date_part(array, DatePart::Second)?; - // This assumes array is primitive and not a dictionary let secs = as_int32_array(secs.as_ref())?; let subsecs = date_part(array, DatePart::Nanosecond)?; let subsecs = as_int32_array(subsecs.as_ref())?; @@ -429,11 +504,8 @@ fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result { } } -/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the -/// result to a total number of seconds, milliseconds, microseconds or -/// nanoseconds -/// -/// Given epoch return f64, this is a duplicated function to optimize for f64 type +// Converts seconds to f64 with the specified time unit. +// Used for Interval and Duration types that need floating-point precision. 
fn seconds(array: &dyn Array, unit: TimeUnit) -> Result { let sf = match unit { Second => 1_f64, @@ -442,7 +514,6 @@ fn seconds(array: &dyn Array, unit: TimeUnit) -> Result { Nanosecond => 1_000_000_000_f64, }; let secs = date_part(array, DatePart::Second)?; - // This assumes array is primitive and not a dictionary let secs = as_int32_array(secs.as_ref())?; let subsecs = date_part(array, DatePart::Nanosecond)?; let subsecs = as_int32_array(subsecs.as_ref())?; diff --git a/datafusion/functions/src/datetime/extract.rs b/datafusion/functions/src/datetime/extract.rs deleted file mode 100644 index bf495e259b7e..000000000000 --- a/datafusion/functions/src/datetime/extract.rs +++ /dev/null @@ -1,525 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::str::FromStr; -use std::sync::Arc; - -use arrow::array::timezone::Tz; -use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, PrimitiveBuilder}; -use arrow::compute::kernels::cast_utils::IntervalUnit; -use arrow::compute::{binary, date_part, DatePart}; -use arrow::datatypes::DataType::{ - Date32, Date64, Duration, Interval, Time32, Time64, Timestamp, -}; -use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; -use arrow::datatypes::{ - ArrowTimestampType, DataType, Field, FieldRef, Int32Type, TimeUnit, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, -}; -use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; -use datafusion_common::cast::as_primitive_array; -use datafusion_common::types::{logical_date, NativeType}; -use std::ops::Add; - -use datafusion_common::{ - cast::{ - as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array, - as_time32_second_array, as_time64_microsecond_array, as_time64_nanosecond_array, - as_timestamp_microsecond_array, as_timestamp_millisecond_array, - as_timestamp_nanosecond_array, as_timestamp_second_array, - }, - exec_err, internal_datafusion_err, internal_err, not_impl_err, - types::logical_string, - utils::take_function_args, - Result, ScalarValue, -}; -use datafusion_expr::{ - ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, - TypeSignature, Volatility, -}; -use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; -use datafusion_macros::user_doc; - -#[user_doc( - doc_section(label = "Time and Date Functions"), - description = "Returns the specified part of the date as an integer.", - syntax_example = "extract(field FROM source)", - argument( - name = "field", - description = r#"Part of the date to return. 
The following date parts are supported: - -- year -- quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in) -- month -- week (week of the year) -- day (day of the month) -- hour -- minute -- second -- millisecond -- microsecond -- nanosecond -- dow (day of the week where Sunday is 0) -- doy (day of the year) -- epoch (seconds since Unix epoch) -- isodow (day of the week where Monday is 0) -"# - ), - argument( - name = "source", - description = "Time expression to operate on. Can be a constant, column, or function." - ) -)] -#[derive(Debug, PartialEq, Eq, Hash)] -pub struct ExtractFunc { - signature: Signature, -} - -impl Default for ExtractFunc { - fn default() -> Self { - Self::new() - } -} - -impl ExtractFunc { - pub fn new() -> Self { - Self { - signature: Signature::one_of( - vec![ - TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - Coercion::new_implicit( - TypeSignatureClass::Timestamp, - // Not consistent with Postgres and DuckDB but to avoid regression we implicit cast string to timestamp - vec![TypeSignatureClass::Native(logical_string())], - NativeType::Timestamp(Nanosecond, None), - ), - ]), - TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - Coercion::new_exact(TypeSignatureClass::Native(logical_date())), - ]), - TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - Coercion::new_exact(TypeSignatureClass::Time), - ]), - TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - Coercion::new_exact(TypeSignatureClass::Interval), - ]), - TypeSignature::Coercible(vec![ - Coercion::new_exact(TypeSignatureClass::Native(logical_string())), - Coercion::new_exact(TypeSignatureClass::Duration), - ]), - ], - Volatility::Immutable, - ), - } - } -} - -impl ScalarUDFImpl for ExtractFunc { - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - "extract" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn return_type(&self, _arg_types: &[DataType]) -> Result { - internal_err!("return_field_from_args should be called instead") - } - - fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { - let [field, _] = take_function_args(self.name(), args.scalar_arguments)?; - - field - .and_then(|sv| { - sv.try_as_str() - .flatten() - .filter(|s| !s.is_empty()) - .map(|part| { - if is_epoch(part) { - Field::new(self.name(), DataType::Float64, true) - } else { - Field::new(self.name(), DataType::Int32, true) - } - }) - }) - .map(Arc::new) - .map_or_else( - || exec_err!("{} requires non-empty constant string", self.name()), - Ok, - ) - } - - fn invoke_with_args( - &self, - args: datafusion_expr::ScalarFunctionArgs, - ) -> Result { - let config = &args.config_options; - let args = args.args; - let [part, array] = take_function_args(self.name(), args)?; - - let part = if let ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) = part { - v - } else if let ColumnarValue::Scalar(ScalarValue::Utf8View(Some(v))) = part { - v - } else { - return exec_err!("First argument of `EXTRACT` must be non-null scalar Utf8"); - }; - - let is_scalar = matches!(array, ColumnarValue::Scalar(_)); - - let array = match array { - ColumnarValue::Array(array) => Arc::clone(&array), - ColumnarValue::Scalar(scalar) => scalar.to_array()?, - }; - - let (is_timezone_aware, tz_str_opt) = match array.data_type() { - Timestamp(_, Some(tz_str)) => (true, 
Some(Arc::clone(tz_str))), - _ => (false, None), - }; - - // Adjust timestamps for extraction - let array = if is_timezone_aware { - // For timezone-aware timestamps, extract in their own timezone - let tz_str = tz_str_opt.as_ref().unwrap(); - let tz = match tz_str.parse::() { - Ok(tz) => tz, - Err(_) => return exec_err!("Invalid timezone"), - }; - match array.data_type() { - Timestamp(time_unit, _) => match time_unit { - Nanosecond => { - adjust_timestamp_array::(&array, tz)? - } - Microsecond => { - adjust_timestamp_array::(&array, tz)? - } - Millisecond => { - adjust_timestamp_array::(&array, tz)? - } - Second => adjust_timestamp_array::(&array, tz)?, - }, - _ => array, - } - } else if let Timestamp(time_unit, None) = array.data_type() { - // For naive timestamps, interpret in session timezone - let tz = match config.execution.time_zone.parse::() { - Ok(tz) => tz, - Err(_) => return exec_err!("Invalid timezone"), - }; - match time_unit { - Nanosecond => { - adjust_timestamp_array::(&array, tz)? - } - Microsecond => { - adjust_timestamp_array::(&array, tz)? - } - Millisecond => { - adjust_timestamp_array::(&array, tz)? - } - Second => adjust_timestamp_array::(&array, tz)?, - } - } else { - array - }; - - let part_trim = part_normalization(&part); - - // using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds") - // and synonyms ( like "ms,msec,msecond,millisecond") to Arrow - let mut arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { - match interval_unit { - IntervalUnit::Year => date_part(array.as_ref(), DatePart::Year)?, - IntervalUnit::Month => date_part(array.as_ref(), DatePart::Month)?, - IntervalUnit::Week => date_part(array.as_ref(), DatePart::Week)?, - IntervalUnit::Day => date_part(array.as_ref(), DatePart::Day)?, - IntervalUnit::Hour => date_part(array.as_ref(), DatePart::Hour)?, - IntervalUnit::Minute => date_part(array.as_ref(), DatePart::Minute)?, - IntervalUnit::Second => seconds_as_i32(array.as_ref(), Second)?, - IntervalUnit::Millisecond => seconds_as_i32(array.as_ref(), Millisecond)?, - IntervalUnit::Microsecond => seconds_as_i32(array.as_ref(), Microsecond)?, - IntervalUnit::Nanosecond => seconds_as_i32(array.as_ref(), Nanosecond)?, - // century and decade are not supported by `DatePart`, although they are supported in postgres - _ => return exec_err!("Date part '{part}' not supported"), - } - } else { - // special cases that can be extracted (in postgres) but are not interval units - match part_trim.to_lowercase().as_str() { - "qtr" | "quarter" => date_part(array.as_ref(), DatePart::Quarter)?, - "doy" => date_part(array.as_ref(), DatePart::DayOfYear)?, - "dow" => date_part(array.as_ref(), DatePart::DayOfWeekSunday0)?, - "isodow" => date_part(array.as_ref(), DatePart::DayOfWeekMonday0)?, - "epoch" => epoch(array.as_ref())?, - _ => return exec_err!("Date part '{part}' not supported"), - } - }; - - // Special adjustment for hour extraction on timezone-aware timestamps - if is_timezone_aware && part_trim.to_lowercase() == "hour" { - if let Some(tz_str) = &tz_str_opt { - let offset_hours = if tz_str.as_ref() == "+00:00" { - 0 - } else { - let sign = if tz_str.starts_with('+') { 1i32 } else { -1i32 }; - let hours_str = &tz_str[1..3]; - let hours: i32 = hours_str.parse().unwrap(); - sign * hours - }; - let int_arr = as_int32_array(&arr)?; - let mut builder = PrimitiveBuilder::::new(); - for i in 0..arr.len() { - if arr.is_null(i) { - builder.append_null(); - } else { - let v = int_arr.value(i); - builder.append_value(v + 
offset_hours); - } - } - arr = Arc::new(builder.finish()); - } - } - - Ok(if is_scalar { - ColumnarValue::Scalar(ScalarValue::try_from_array(arr.as_ref(), 0)?) - } else { - ColumnarValue::Array(arr) - }) - } - - fn aliases(&self) -> &[String] { - &[] - } - - fn documentation(&self) -> Option<&Documentation> { - self.doc() - } -} - -fn adjust_to_local_time(ts: i64, tz: Tz) -> Result { - fn convert_timestamp(ts: i64, converter: F) -> Result> - where - F: Fn(i64) -> MappedLocalTime>, - { - match converter(ts) { - MappedLocalTime::Ambiguous(earliest, latest) => exec_err!( - "Ambiguous timestamp. Do you mean {:?} or {:?}", - earliest, - latest - ), - MappedLocalTime::None => exec_err!( - "The local time does not exist because there is a gap in the local time." - ), - MappedLocalTime::Single(date_time) => Ok(date_time), - } - } - - let date_time = match T::UNIT { - Nanosecond => Utc.timestamp_nanos(ts), - Microsecond => convert_timestamp(ts, |ts| Utc.timestamp_micros(ts))?, - Millisecond => convert_timestamp(ts, |ts| Utc.timestamp_millis_opt(ts))?, - Second => convert_timestamp(ts, |ts| Utc.timestamp_opt(ts, 0))?, - }; - - let offset_seconds: i64 = tz - .offset_from_utc_datetime(&date_time.naive_utc()) - .fix() - .local_minus_utc() as i64; - - let adjusted_date_time = date_time.add( - TimeDelta::try_seconds(offset_seconds) - .ok_or_else(|| internal_datafusion_err!("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000"))?, - ); - - // convert back to i64 - match T::UNIT { - Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or_else(|| { - internal_datafusion_err!( - "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807" - ) - }), - Microsecond => Ok(adjusted_date_time.timestamp_micros()), - Millisecond => Ok(adjusted_date_time.timestamp_millis()), - Second => Ok(adjusted_date_time.timestamp()), - } -} - -fn adjust_timestamp_array( - array: &ArrayRef, - tz: Tz, -) -> Result { - let mut builder = PrimitiveBuilder::::new(); - let primitive_array = as_primitive_array::(array)?; - for ts_opt in primitive_array.iter() { - match ts_opt { - None => builder.append_null(), - Some(ts) => { - let adjusted_ts = adjust_to_local_time::(ts, tz)?; - builder.append_value(adjusted_ts); - } - } - } - Ok(Arc::new(builder.finish())) -} - -fn is_epoch(part: &str) -> bool { - let part = part_normalization(part); - matches!(part.to_lowercase().as_str(), "epoch") -} - -// Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error -fn part_normalization(part: &str) -> &str { - part.strip_prefix(|c| c == '\'' || c == '\"') - .and_then(|s| s.strip_suffix(|c| c == '\'' || c == '\"')) - .unwrap_or(part) -} - -/// Invoke [`date_part`] on an `array` (e.g. 
Timestamp) and convert the -/// result to a total number of seconds, milliseconds, microseconds or -/// nanoseconds -fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result { - // Nanosecond is neither supported in Postgres nor DuckDB, to avoid dealing - // with overflow and precision issue we don't support nanosecond - if unit == Nanosecond { - return not_impl_err!("Date part {unit:?} not supported"); - } - - let conversion_factor = match unit { - Second => 1_000_000_000, - Millisecond => 1_000_000, - Microsecond => 1_000, - Nanosecond => 1, - }; - - let second_factor = match unit { - Second => 1, - Millisecond => 1_000, - Microsecond => 1_000_000, - Nanosecond => 1_000_000_000, - }; - - let secs = date_part(array, DatePart::Second)?; - // This assumes array is primitive and not a dictionary - let secs = as_int32_array(secs.as_ref())?; - let subsecs = date_part(array, DatePart::Nanosecond)?; - let subsecs = as_int32_array(subsecs.as_ref())?; - - // Special case where there are no nulls. - if subsecs.null_count() == 0 { - let r: Int32Array = binary(secs, subsecs, |secs, subsecs| { - secs * second_factor + (subsecs % 1_000_000_000) / conversion_factor - })?; - Ok(Arc::new(r)) - } else { - // Nulls in secs are preserved, nulls in subsecs are treated as zero to account for the case - // where the number of nanoseconds overflows. - let r: Int32Array = secs - .iter() - .zip(subsecs) - .map(|(secs, subsecs)| { - secs.map(|secs| { - let subsecs = subsecs.unwrap_or(0); - secs * second_factor + (subsecs % 1_000_000_000) / conversion_factor - }) - }) - .collect(); - Ok(Arc::new(r)) - } -} - -/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the -/// result to a total number of seconds, milliseconds, microseconds or -/// nanoseconds -/// -/// Given epoch return f64, this is a duplicated function to optimize for f64 type -fn seconds(array: &dyn Array, unit: TimeUnit) -> Result { - let sf = match unit { - Second => 1_f64, - Millisecond => 1_000_f64, - Microsecond => 1_000_000_f64, - Nanosecond => 1_000_000_000_f64, - }; - let secs = date_part(array, DatePart::Second)?; - // This assumes array is primitive and not a dictionary - let secs = as_int32_array(secs.as_ref())?; - let subsecs = date_part(array, DatePart::Nanosecond)?; - let subsecs = as_int32_array(subsecs.as_ref())?; - - // Special case where there are no nulls. - if subsecs.null_count() == 0 { - let r: Float64Array = binary(secs, subsecs, |secs, subsecs| { - (secs as f64 + ((subsecs % 1_000_000_000) as f64 / 1_000_000_000_f64)) * sf - })?; - Ok(Arc::new(r)) - } else { - // Nulls in secs are preserved, nulls in subsecs are treated as zero to account for the case - // where the number of nanoseconds overflows. 
- let r: Float64Array = secs - .iter() - .zip(subsecs) - .map(|(secs, subsecs)| { - secs.map(|secs| { - let subsecs = subsecs.unwrap_or(0); - (secs as f64 + ((subsecs % 1_000_000_000) as f64 / 1_000_000_000_f64)) - * sf - }) - }) - .collect(); - Ok(Arc::new(r)) - } -} - -fn epoch(array: &dyn Array) -> Result { - const SECONDS_IN_A_DAY: f64 = 86400_f64; - - let f: Float64Array = match array.data_type() { - Timestamp(Second, _) => as_timestamp_second_array(array)?.unary(|x| x as f64), - Timestamp(Millisecond, _) => { - as_timestamp_millisecond_array(array)?.unary(|x| x as f64 / 1_000_f64) - } - Timestamp(Microsecond, _) => { - as_timestamp_microsecond_array(array)?.unary(|x| x as f64 / 1_000_000_f64) - } - Timestamp(Nanosecond, _) => { - as_timestamp_nanosecond_array(array)?.unary(|x| x as f64 / 1_000_000_000_f64) - } - Date32 => as_date32_array(array)?.unary(|x| x as f64 * SECONDS_IN_A_DAY), - Date64 => as_date64_array(array)?.unary(|x| x as f64 / 1_000_f64), - Time32(Second) => as_time32_second_array(array)?.unary(|x| x as f64), - Time32(Millisecond) => { - as_time32_millisecond_array(array)?.unary(|x| x as f64 / 1_000_f64) - } - Time64(Microsecond) => { - as_time64_microsecond_array(array)?.unary(|x| x as f64 / 1_000_000_f64) - } - Time64(Nanosecond) => { - as_time64_nanosecond_array(array)?.unary(|x| x as f64 / 1_000_000_000_f64) - } - Interval(_) | Duration(_) => return seconds(array, Second), - d => return exec_err!("Cannot convert {d:?} to epoch"), - }; - Ok(Arc::new(f)) -} diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index a842b6d7a9d5..60d399e90565 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -19,6 +19,13 @@ use std::sync::Arc; +use arrow::array::timezone::Tz; +use arrow::datatypes::ArrowTimestampType; +use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; +use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; +use datafusion_common::{exec_err, internal_datafusion_err, Result}; +use std::ops::Add; + use datafusion_expr::ScalarUDF; pub mod common; @@ -27,7 +34,6 @@ pub mod current_time; pub mod date_bin; pub mod date_part; pub mod date_trunc; -pub mod extract; pub mod from_unixtime; pub mod make_date; pub mod now; @@ -38,13 +44,61 @@ pub mod to_local_time; pub mod to_timestamp; pub mod to_unixtime; +// Adjusts a timestamp to local time by applying the timezone offset. +pub fn adjust_to_local_time(ts: i64, tz: Tz) -> Result { + fn convert_timestamp(ts: i64, converter: F) -> Result> + where + F: Fn(i64) -> MappedLocalTime>, + { + match converter(ts) { + MappedLocalTime::Ambiguous(earliest, latest) => exec_err!( + "Ambiguous timestamp. Do you mean {:?} or {:?}", + earliest, + latest + ), + MappedLocalTime::None => exec_err!( + "The local time does not exist because there is a gap in the local time." 
+ ), + MappedLocalTime::Single(date_time) => Ok(date_time), + } + } + + let date_time = match T::UNIT { + Nanosecond => Utc.timestamp_nanos(ts), + Microsecond => convert_timestamp(ts, |ts| Utc.timestamp_micros(ts))?, + Millisecond => convert_timestamp(ts, |ts| Utc.timestamp_millis_opt(ts))?, + Second => convert_timestamp(ts, |ts| Utc.timestamp_opt(ts, 0))?, + }; + + let offset_seconds: i64 = tz + .offset_from_utc_datetime(&date_time.naive_utc()) + .fix() + .local_minus_utc() as i64; + + let adjusted_date_time = date_time.add( + TimeDelta::try_seconds(offset_seconds) + .ok_or_else(|| internal_datafusion_err!("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000"))?, + ); + + // convert back to i64 + match T::UNIT { + Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or_else(|| { + internal_datafusion_err!( + "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807" + ) + }), + Microsecond => Ok(adjusted_date_time.timestamp_micros()), + Millisecond => Ok(adjusted_date_time.timestamp_millis()), + Second => Ok(adjusted_date_time.timestamp()), + } +} + // create UDFs make_udf_function!(current_date::CurrentDateFunc, current_date); make_udf_function!(current_time::CurrentTimeFunc, current_time); make_udf_function!(date_bin::DateBinFunc, date_bin); make_udf_function!(date_part::DatePartFunc, date_part); make_udf_function!(date_trunc::DateTruncFunc, date_trunc); -make_udf_function!(extract::ExtractFunc, extract); make_udf_function!(make_date::MakeDateFunc, make_date); make_udf_function!(from_unixtime::FromUnixtimeFunc, from_unixtime); make_udf_function!(to_char::ToCharFunc, to_char); @@ -267,7 +321,6 @@ pub fn functions() -> Vec> { date_bin(), date_part(), date_trunc(), - extract(), from_unixtime(), make_date(), now(&ConfigOptions::default()), diff --git a/datafusion/functions/src/datetime/planner.rs b/datafusion/functions/src/datetime/planner.rs index 20442d0205a2..f4b64c3711e2 100644 --- a/datafusion/functions/src/datetime/planner.rs +++ b/datafusion/functions/src/datetime/planner.rs @@ -29,7 +29,7 @@ impl ExprPlanner for DatetimeFunctionPlanner { args: Vec, ) -> datafusion_common::Result>> { Ok(PlannerResult::Planned(Expr::ScalarFunction( - ScalarFunction::new_udf(crate::datetime::extract(), args), + ScalarFunction::new_udf(crate::datetime::date_part(), args), ))) } } diff --git a/datafusion/functions/src/datetime/to_local_time.rs b/datafusion/functions/src/datetime/to_local_time.rs index a2a54398a33b..ccdb45c9b05f 100644 --- a/datafusion/functions/src/datetime/to_local_time.rs +++ b/datafusion/functions/src/datetime/to_local_time.rs @@ -16,7 +16,6 @@ // under the License. 
use std::any::Any; -use std::ops::Add; use std::sync::Arc; use arrow::array::timezone::Tz; @@ -27,12 +26,11 @@ use arrow::datatypes::{ ArrowTimestampType, DataType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use chrono::{DateTime, MappedLocalTime, Offset, TimeDelta, TimeZone, Utc}; +use crate::datetime::adjust_to_local_time; use datafusion_common::cast::as_primitive_array; use datafusion_common::{ - exec_err, internal_datafusion_err, plan_err, utils::take_function_args, Result, - ScalarValue, + exec_err, plan_err, utils::take_function_args, Result, ScalarValue, }; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, @@ -293,56 +291,6 @@ impl ToLocalTimeFunc { /// ``` /// /// See `test_adjust_to_local_time()` for example -fn adjust_to_local_time(ts: i64, tz: Tz) -> Result { - fn convert_timestamp(ts: i64, converter: F) -> Result> - where - F: Fn(i64) -> MappedLocalTime>, - { - match converter(ts) { - MappedLocalTime::Ambiguous(earliest, latest) => exec_err!( - "Ambiguous timestamp. Do you mean {:?} or {:?}", - earliest, - latest - ), - MappedLocalTime::None => exec_err!( - "The local time does not exist because there is a gap in the local time." - ), - MappedLocalTime::Single(date_time) => Ok(date_time), - } - } - - let date_time = match T::UNIT { - Nanosecond => Utc.timestamp_nanos(ts), - Microsecond => convert_timestamp(ts, |ts| Utc.timestamp_micros(ts))?, - Millisecond => convert_timestamp(ts, |ts| Utc.timestamp_millis_opt(ts))?, - Second => convert_timestamp(ts, |ts| Utc.timestamp_opt(ts, 0))?, - }; - - let offset_seconds: i64 = tz - .offset_from_utc_datetime(&date_time.naive_utc()) - .fix() - .local_minus_utc() as i64; - - let adjusted_date_time = date_time.add( - // This should not fail under normal circumstances as the - // maximum possible offset is 26 hours (93,600 seconds) - TimeDelta::try_seconds(offset_seconds) - .ok_or_else(|| internal_datafusion_err!("Offset seconds should be less than i64::MAX / 1_000 or greater than -i64::MAX / 1_000"))?, - ); - - // convert the naive datetime back to i64 - match T::UNIT { - Nanosecond => adjusted_date_time.timestamp_nanos_opt().ok_or_else(|| - internal_datafusion_err!( - "Failed to convert DateTime to timestamp in nanosecond. This error may occur if the date is out of range. 
The supported date ranges are between 1677-09-21T00:12:43.145224192 and 2262-04-11T23:47:16.854775807" - ) - ), - Microsecond => Ok(adjusted_date_time.timestamp_micros()), - Millisecond => Ok(adjusted_date_time.timestamp_millis()), - Second => Ok(adjusted_date_time.timestamp()), - } -} - impl ScalarUDFImpl for ToLocalTimeFunc { fn as_any(&self) -> &dyn Any { self diff --git a/datafusion/sqllogictest/test_files/extract_tz.slt b/datafusion/sqllogictest/test_files/extract_tz.slt index 32e6b0fbfbb6..c13c37e15c14 100644 --- a/datafusion/sqllogictest/test_files/extract_tz.slt +++ b/datafusion/sqllogictest/test_files/extract_tz.slt @@ -90,4 +90,23 @@ SELECT EXTRACT(HOUR FROM TIMESTAMP '2023-10-30 18:20:59+04:00'), ---- 22 20 59 +query II +SELECT EXTRACT(HOUR FROM TIMESTAMP '2025-10-30 10:45:30+02:30'), + EXTRACT(MINUTE FROM TIMESTAMP '2023-10-30 18:20:59-04:30'); +---- +13 50 + +#query I +#SELECT EXTRACT(HOUR FROM CAST('2025-10-30 10:45:30' AS TIMESTAMP) AT TIME ZONE 'Asia/Tokyo'); +#---- +#19 + +query III +SELECT EXTRACT(HOUR FROM TIMESTAMP '2023-10-30 18:20:59+08:00'), + EXTRACT(DAY FROM TIMESTAMP '2023-10-30 18:20:59+07:00'), + EXTRACT(DAY FROM TIMESTAMP '2023-10-30 07:20:59-12:00'); +---- +2 31 29 + + diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 7a9dfe151961..b72f73d44698 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4345,17 +4345,17 @@ EXPLAIN SELECT extract(month from ts) as months ---- logical_plan 01)Sort: months DESC NULLS FIRST, fetch=5 -02)--Projection: extract(Utf8("MONTH"),csv_with_timestamps.ts) AS months -03)----Aggregate: groupBy=[[extract(Utf8("MONTH"), csv_with_timestamps.ts)]], aggr=[[]] +02)--Projection: date_part(Utf8("MONTH"),csv_with_timestamps.ts) AS months +03)----Aggregate: groupBy=[[date_part(Utf8("MONTH"), csv_with_timestamps.ts)]], aggr=[[]] 04)------TableScan: csv_with_timestamps projection=[ts] physical_plan 01)SortPreservingMergeExec: [months@0 DESC], fetch=5 02)--SortExec: TopK(fetch=5), expr=[months@0 DESC], preserve_partitioning=[true] -03)----ProjectionExec: expr=[extract(Utf8("MONTH"),csv_with_timestamps.ts)@0 as months] -04)------AggregateExec: mode=FinalPartitioned, gby=[extract(Utf8("MONTH"),csv_with_timestamps.ts)@0 as extract(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] +03)----ProjectionExec: expr=[date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0 as months] +04)------AggregateExec: mode=FinalPartitioned, gby=[date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0 as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] 05)--------CoalesceBatchesExec: target_batch_size=2 -06)----------RepartitionExec: partitioning=Hash([extract(Utf8("MONTH"),csv_with_timestamps.ts)@0], 8), input_partitions=8 -07)------------AggregateExec: mode=Partial, gby=[extract(MONTH, ts@0) as extract(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] +06)----------RepartitionExec: partitioning=Hash([date_part(Utf8("MONTH"),csv_with_timestamps.ts)@0], 8), input_partitions=8 +07)------------AggregateExec: mode=Partial, gby=[date_part(MONTH, ts@0) as date_part(Utf8("MONTH"),csv_with_timestamps.ts)], aggr=[] 08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/timestamps.csv]]}, projection=[ts], output_ordering=[ts@0 DESC], file_type=csv, has_header=false diff --git 
a/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt b/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt index 9a666595ac57..de6a153f58d9 100644 --- a/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt +++ b/datafusion/sqllogictest/test_files/optimizer_group_by_constant.slt @@ -90,7 +90,7 @@ FROM test_table t GROUP BY 1 ---- logical_plan -01)Projection: Boolean(true) AS NOT extract(Utf8("MONTH"),now()) BETWEEN Int64(50) AND Int64(60), count(Int64(1)) +01)Projection: Boolean(true) AS NOT date_part(Utf8("MONTH"),now()) BETWEEN Int64(50) AND Int64(60), count(Int64(1)) 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] 03)----SubqueryAlias: t 04)------TableScan: test_table projection=[] diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part index 12b06bb485fb..291d56e43f2d 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part @@ -62,7 +62,7 @@ logical_plan 02)--Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, sum(shipping.volume) AS revenue 03)----Aggregate: groupBy=[[shipping.supp_nation, shipping.cust_nation, shipping.l_year]], aggr=[[sum(shipping.volume)]] 04)------SubqueryAlias: shipping -05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, extract(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume +05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume 06)----------Inner Join: customer.c_nationkey = n2.n_nationkey Filter: n1.n_name = Utf8View("FRANCE") AND n2.n_name = Utf8View("GERMANY") OR n1.n_name = Utf8View("GERMANY") AND n2.n_name = Utf8View("FRANCE") 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey, n1.n_name 08)--------------Inner Join: supplier.s_nationkey = n1.n_nationkey @@ -91,7 +91,7 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([supp_nation@0, cust_nation@1, l_year@2], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] -08)--------------ProjectionExec: expr=[n_name@3 as supp_nation, n_name@4 as cust_nation, extract(YEAR, l_shipdate@2) as l_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume] +08)--------------ProjectionExec: expr=[n_name@3 as supp_nation, n_name@4 as cust_nation, date_part(YEAR, l_shipdate@2) as l_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume] 09)----------------CoalesceBatchesExec: target_batch_size=8192 10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_nationkey@3, n_nationkey@0)], filter=n_name@0 = FRANCE AND n_name@1 = GERMANY OR n_name@0 = GERMANY AND n_name@1 = FRANCE, projection=[l_extendedprice@0, l_discount@1, l_shipdate@2, n_name@4, n_name@6] 11)--------------------CoalesceBatchesExec: target_batch_size=8192 diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part index a500f89f5f4b..a8a5f3d2636f 100644 --- 
a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part @@ -60,7 +60,7 @@ logical_plan 02)--Projection: all_nations.o_year, CAST(CAST(sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END) AS Decimal128(12, 2)) / CAST(sum(all_nations.volume) AS Decimal128(12, 2)) AS Decimal128(15, 2)) AS mkt_share 03)----Aggregate: groupBy=[[all_nations.o_year]], aggr=[[sum(CASE WHEN all_nations.nation = Utf8View("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]] 04)------SubqueryAlias: all_nations -05)--------Projection: extract(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation +05)--------Projection: date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation 06)----------Inner Join: n1.n_regionkey = region.r_regionkey 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name 08)--------------Inner Join: supplier.s_nationkey = n2.n_nationkey diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part index 611a05e7371e..3b31c1bc2e8e 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part @@ -56,7 +56,7 @@ logical_plan 02)--Projection: profit.nation, profit.o_year, sum(profit.amount) AS sum_profit 03)----Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[sum(profit.amount)]] 04)------SubqueryAlias: profit -05)--------Projection: nation.n_name AS nation, extract(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount +05)--------Projection: nation.n_name AS nation, date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount 06)----------Inner Join: supplier.s_nationkey = nation.n_nationkey 07)------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate 08)--------------Inner Join: lineitem.l_orderkey = orders.o_orderkey @@ -82,7 +82,7 @@ physical_plan 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([nation@0, o_year@1], 4), input_partitions=4 07)------------AggregateExec: mode=Partial, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)] -08)--------------ProjectionExec: expr=[n_name@5 as nation, extract(YEAR, o_orderdate@4) as o_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) - ps_supplycost@3 * l_quantity@0 as amount] +08)--------------ProjectionExec: expr=[n_name@5 as nation, date_part(YEAR, o_orderdate@4) as o_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) - ps_supplycost@3 * l_quantity@0 as amount] 09)----------------CoalesceBatchesExec: target_batch_size=8192 10)------------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_nationkey@3, n_nationkey@0)],
projection=[l_quantity@0, l_extendedprice@1, l_discount@2, ps_supplycost@4, o_orderdate@5, n_name@7] 11)--------------------CoalesceBatchesExec: target_batch_size=8192 From 8cd5d2ea3a730a65f320e083af6e1204269f6e83 Mon Sep 17 00:00:00 2001 From: sriram Date: Mon, 10 Nov 2025 20:19:16 +0530 Subject: [PATCH 009/157] CI Fix. --- datafusion/functions/src/datetime/date_part.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 4a387a0d1641..c7f81e3571f0 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -46,7 +46,7 @@ use datafusion_common::{ exec_err, internal_err, not_impl_err, types::logical_string, utils::take_function_args, - Result, ScalarValue, + DataFusionError, Result, ScalarValue, }; use datafusion_expr::{ ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, @@ -236,10 +236,9 @@ impl ScalarUDFImpl for DatePartFunc { } } else if let Timestamp(time_unit, None) = array.data_type() { // For naive timestamps, interpret in session timezone - let tz = match config.execution.time_zone.parse::() { - Ok(tz) => tz, - Err(_) => return exec_err!("Invalid timezone"), - }; + let tz: Tz = config.execution.time_zone.parse().map_err(|_| { + DataFusionError::Execution("Invalid timezone".to_string()) + })?; match time_unit { Nanosecond => { adjust_timestamp_array::(&array, tz)? From 6ec6e468258bd4031fd5fcadccaa0dc24c8a37ab Mon Sep 17 00:00:00 2001 From: sriram Date: Mon, 10 Nov 2025 22:16:16 +0530 Subject: [PATCH 010/157] CI Fix (i). --- datafusion/functions/src/datetime/date_part.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index c7f81e3571f0..f918c752bdeb 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -236,7 +236,7 @@ impl ScalarUDFImpl for DatePartFunc { } } else if let Timestamp(time_unit, None) = array.data_type() { // For naive timestamps, interpret in session timezone - let tz: Tz = config.execution.time_zone.parse().map_err(|_| { + let tz: Tz = config.execution.time_zone.as_str().parse().map_err(|_| { DataFusionError::Execution("Invalid timezone".to_string()) })?; match time_unit { From cd7c5e8c31955202c03fc8d41a6281ea0ebd2084 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 26 Oct 2025 19:11:17 -0500 Subject: [PATCH 011/157] Push partition_statistics into DataSource (#18233) Removes a downcast match in favor of use of the trait. 
This mirrors the changes to DataSourceExec to use partition_statistics instead of statistics from https://github.com/apache/datafusion/pull/15852 --- datafusion/datasource/src/file_scan_config.rs | 124 +++++++++++++++++- datafusion/datasource/src/memory.rs | 28 +++- datafusion/datasource/src/source.rs | 32 ++--- 3 files changed, 159 insertions(+), 25 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 4dfb6a4ec3d3..695252803bae 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -598,8 +598,39 @@ impl DataSource for FileScanConfig { SchedulingType::Cooperative } - fn statistics(&self) -> Result { - Ok(self.projected_stats()) + fn partition_statistics(&self, partition: Option) -> Result { + if let Some(partition) = partition { + // Get statistics for a specific partition + if let Some(file_group) = self.file_groups.get(partition) { + if let Some(stat) = file_group.file_statistics(None) { + // Project the statistics based on the projection + let table_cols_stats = self + .projection_indices() + .into_iter() + .map(|idx| { + if idx < self.file_schema().fields().len() { + stat.column_statistics[idx].clone() + } else { + // TODO provide accurate stat for partition column + // See https://github.com/apache/datafusion/issues/1186 + ColumnStatistics::new_unknown() + } + }) + .collect(); + + return Ok(Statistics { + num_rows: stat.num_rows, + total_byte_size: stat.total_byte_size, + column_statistics: table_cols_stats, + }); + } + } + // If no statistics available for this partition, return unknown + Ok(Statistics::new_unknown(&self.projected_schema())) + } else { + // Return aggregate statistics across all partitions + Ok(self.projected_stats()) + } } fn with_fetch(&self, limit: Option) -> Option> { @@ -1603,7 +1634,7 @@ mod tests { ); let source_statistics = conf.file_source.statistics().unwrap(); - let conf_stats = conf.statistics().unwrap(); + let conf_stats = conf.partition_statistics(None).unwrap(); // projection should be reflected in the file source statistics assert_eq!(conf_stats.num_rows, Precision::Inexact(3)); @@ -2510,4 +2541,91 @@ mod tests { Ok(()) } + + #[test] + fn test_partition_statistics_projection() { + // This test verifies that partition_statistics applies projection correctly. + // The old implementation had a bug where it returned file group statistics + // without applying the projection, returning all column statistics instead + // of just the projected ones. 
+ + use crate::source::DataSourceExec; + use datafusion_physical_plan::ExecutionPlan; + + // Create a schema with 4 columns + let schema = Arc::new(Schema::new(vec![ + Field::new("col0", DataType::Int32, false), + Field::new("col1", DataType::Int32, false), + Field::new("col2", DataType::Int32, false), + Field::new("col3", DataType::Int32, false), + ])); + + // Create statistics for all 4 columns + let file_group_stats = Statistics { + num_rows: Precision::Exact(100), + total_byte_size: Precision::Exact(1024), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(0), + ..ColumnStatistics::new_unknown() + }, + ColumnStatistics { + null_count: Precision::Exact(5), + ..ColumnStatistics::new_unknown() + }, + ColumnStatistics { + null_count: Precision::Exact(10), + ..ColumnStatistics::new_unknown() + }, + ColumnStatistics { + null_count: Precision::Exact(15), + ..ColumnStatistics::new_unknown() + }, + ], + }; + + // Create a file group with statistics + let file_group = FileGroup::new(vec![PartitionedFile::new("test.parquet", 1024)]) + .with_statistics(Arc::new(file_group_stats)); + + // Create a FileScanConfig with projection: only keep columns 0 and 2 + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test:///").unwrap(), + Arc::clone(&schema), + Arc::new(MockSource::default()), + ) + .with_projection(Some(vec![0, 2])) // Only project columns 0 and 2 + .with_file_groups(vec![file_group]) + .build(); + + // Create a DataSourceExec from the config + let exec = DataSourceExec::from_data_source(config); + + // Get statistics for partition 0 + let partition_stats = exec.partition_statistics(Some(0)).unwrap(); + + // Verify that only 2 columns are in the statistics (the projected ones) + assert_eq!( + partition_stats.column_statistics.len(), + 2, + "Expected 2 column statistics (projected), but got {}", + partition_stats.column_statistics.len() + ); + + // Verify the column statistics are for columns 0 and 2 + assert_eq!( + partition_stats.column_statistics[0].null_count, + Precision::Exact(0), + "First projected column should be col0 with 0 nulls" + ); + assert_eq!( + partition_stats.column_statistics[1].null_count, + Precision::Exact(10), + "Second projected column should be col2 with 10 nulls" + ); + + // Verify row count and byte size are preserved + assert_eq!(partition_stats.num_rows, Precision::Exact(100)); + assert_eq!(partition_stats.total_byte_size, Precision::Exact(1024)); + } } diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs index eb55aa9b0b0d..7d5c8c4834ea 100644 --- a/datafusion/datasource/src/memory.rs +++ b/datafusion/datasource/src/memory.rs @@ -21,6 +21,7 @@ use std::collections::BinaryHeap; use std::fmt; use std::fmt::Debug; use std::ops::Deref; +use std::slice::from_ref; use std::sync::Arc; use crate::sink::DataSink; @@ -192,12 +193,27 @@ impl DataSource for MemorySourceConfig { SchedulingType::Cooperative } - fn statistics(&self) -> Result { - Ok(common::compute_record_batch_statistics( - &self.partitions, - &self.schema, - self.projection.clone(), - )) + fn partition_statistics(&self, partition: Option) -> Result { + if let Some(partition) = partition { + // Compute statistics for a specific partition + if let Some(batches) = self.partitions.get(partition) { + Ok(common::compute_record_batch_statistics( + from_ref(batches), + &self.schema, + self.projection.clone(), + )) + } else { + // Invalid partition index + Ok(Statistics::new_unknown(&self.projected_schema)) + } + } else { + // Compute 
statistics across all partitions + Ok(common::compute_record_batch_statistics( + &self.partitions, + &self.schema, + self.projection.clone(), + )) + } } fn with_fetch(&self, limit: Option) -> Option> { diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 20d9a1d6e53f..11a8a3867b80 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -151,7 +151,21 @@ pub trait DataSource: Send + Sync + Debug { fn scheduling_type(&self) -> SchedulingType { SchedulingType::NonCooperative } - fn statistics(&self) -> Result; + + /// Returns statistics for a specific partition, or aggregate statistics + /// across all partitions if `partition` is `None`. + fn partition_statistics(&self, partition: Option) -> Result; + + /// Returns aggregate statistics across all partitions. + /// + /// # Deprecated + /// Use [`Self::partition_statistics`] instead, which provides more fine-grained + /// control over statistics retrieval (per-partition or aggregate). + #[deprecated(since = "51.0.0", note = "Use partition_statistics instead")] + fn statistics(&self) -> Result { + self.partition_statistics(None) + } + /// Return a copy of this DataSource with a new fetch limit fn with_fetch(&self, _limit: Option) -> Option>; fn fetch(&self) -> Option; @@ -285,21 +299,7 @@ impl ExecutionPlan for DataSourceExec { } fn partition_statistics(&self, partition: Option) -> Result { - if let Some(partition) = partition { - let mut statistics = Statistics::new_unknown(&self.schema()); - if let Some(file_config) = - self.data_source.as_any().downcast_ref::() - { - if let Some(file_group) = file_config.file_groups.get(partition) { - if let Some(stat) = file_group.file_statistics(None) { - statistics = stat.clone(); - } - } - } - Ok(statistics) - } else { - Ok(self.data_source.statistics()?) - } + self.data_source.partition_statistics(partition) } fn with_fetch(&self, limit: Option) -> Option> { From e15b0563dcccc44c0738b18321616022d57053ef Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Mon, 27 Oct 2025 12:45:21 +0800 Subject: [PATCH 012/157] feat: Add `output_bytes` to baseline metrics (#18268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? 
- Closes https://github.com/apache/datafusion/issues/16244 ## Rationale for this change Support `output_bytes` in `BaselineMetrics` (a common metrics set for almost all operators) ``` DataFusion CLI v50.3.0 > explain analyze select * from generate_series(1, 1000000) as t1(v1) order by v1 desc; +-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | plan_type | plan | +-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Plan with Metrics | SortExec: expr=[v1@0 DESC], preserve_partitioning=[false], metrics=[output_rows=1000000, elapsed_compute=96.421534ms, output_bytes=7.6 MB, spill_count=0, spilled_bytes=0.0 B, spilled_rows=0, batches_split=0] | | | ProjectionExec: expr=[value@0 as v1], metrics=[output_rows=1000000, elapsed_compute=34.125µs, output_bytes=7.7 MB] | | | LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=1000000, batch_size=8192], metrics=[output_rows=1000000, elapsed_compute=2.262626ms, output_bytes=7.7 MB] | | | | +-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 1 row(s) fetched. Elapsed 0.080 seconds. ``` Note it might overestimate memory due to a well-known issue. See the PR snippet for details ```rs /// Memory usage of all output batches. /// /// Note: This value may be overestimated. If multiple output `RecordBatch` /// instances share underlying memory buffers, their sizes will be counted /// multiple times. /// Issue: output_bytes: Count, ``` I think this metric provides valuable insight, so it's better for it to overestimate than not exist at all. ## What changes are included in this PR? 1. Add `output_bytes` to `BaselineMetrics`, and it's set to `summary` analyze level. (see config `datafusion.explain.analyze_level` for details) 2. This metrics will be automatically tracked through `record_poll()` API, which is a common interface most operators uses when a new output batch is generated. ## Are these changes tested? UT ## Are there any user-facing changes? 
--- datafusion/core/tests/sql/explain_analyze.rs | 23 +++++++++++ .../physical-plan/src/metrics/baseline.rs | 20 ++++++++++ .../physical-plan/src/metrics/builder.rs | 8 ++++ datafusion/physical-plan/src/metrics/mod.rs | 1 + datafusion/physical-plan/src/metrics/value.rs | 38 ++++++++++++------- docs/source/user-guide/metrics.md | 9 +++-- 6 files changed, 82 insertions(+), 17 deletions(-) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 6d386cc456d8..43f79ead0257 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -63,36 +63,59 @@ async fn explain_analyze_baseline_metrics() { "AggregateExec: mode=Partial, gby=[]", "metrics=[output_rows=3, elapsed_compute=" ); + assert_metrics!( + &formatted, + "AggregateExec: mode=Partial, gby=[]", + "output_bytes=" + ); assert_metrics!( &formatted, "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", "metrics=[output_rows=5, elapsed_compute=" ); + assert_metrics!( + &formatted, + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1]", + "output_bytes=" + ); assert_metrics!( &formatted, "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", "metrics=[output_rows=99, elapsed_compute=" ); + assert_metrics!( + &formatted, + "FilterExec: c13@1 != C2GT5KVyOPZpgKVl110TyZO0NcJ434", + "output_bytes=" + ); assert_metrics!( &formatted, "ProjectionExec: expr=[]", "metrics=[output_rows=5, elapsed_compute=" ); + assert_metrics!(&formatted, "ProjectionExec: expr=[]", "output_bytes="); assert_metrics!( &formatted, "CoalesceBatchesExec: target_batch_size=4096", "metrics=[output_rows=5, elapsed_compute" ); + assert_metrics!( + &formatted, + "CoalesceBatchesExec: target_batch_size=4096", + "output_bytes=" + ); assert_metrics!( &formatted, "UnionExec", "metrics=[output_rows=3, elapsed_compute=" ); + assert_metrics!(&formatted, "UnionExec", "output_bytes="); assert_metrics!( &formatted, "WindowAggExec", "metrics=[output_rows=1, elapsed_compute=" ); + assert_metrics!(&formatted, "WindowAggExec", "output_bytes="); fn expected_to_have_metrics(plan: &dyn ExecutionPlan) -> bool { use datafusion::physical_plan; diff --git a/datafusion/physical-plan/src/metrics/baseline.rs b/datafusion/physical-plan/src/metrics/baseline.rs index 45cef58b5dd8..858773b94664 100644 --- a/datafusion/physical-plan/src/metrics/baseline.rs +++ b/datafusion/physical-plan/src/metrics/baseline.rs @@ -21,6 +21,8 @@ use std::task::Poll; use arrow::record_batch::RecordBatch; +use crate::spill::get_record_batch_memory_size; + use super::{Count, ExecutionPlanMetricsSet, MetricBuilder, Time, Timestamp}; use datafusion_common::Result; @@ -53,6 +55,16 @@ pub struct BaselineMetrics { /// output rows: the total output rows output_rows: Count, + + /// Memory usage of all output batches. + /// + /// Note: This value may be overestimated. If multiple output `RecordBatch` + /// instances share underlying memory buffers, their sizes will be counted + /// multiple times. 
+ /// Issue: + output_bytes: Count, + // Remember to update `docs/source/user-guide/metrics.md` when updating comments + // or adding new metrics } impl BaselineMetrics { @@ -71,6 +83,9 @@ impl BaselineMetrics { output_rows: MetricBuilder::new(metrics) .with_type(super::MetricType::SUMMARY) .output_rows(partition), + output_bytes: MetricBuilder::new(metrics) + .with_type(super::MetricType::SUMMARY) + .output_bytes(partition), } } @@ -84,6 +99,7 @@ impl BaselineMetrics { end_time: Default::default(), elapsed_compute: self.elapsed_compute.clone(), output_rows: Default::default(), + output_bytes: Default::default(), } } @@ -211,6 +227,8 @@ impl RecordOutput for usize { impl RecordOutput for RecordBatch { fn record_output(self, bm: &BaselineMetrics) -> Self { bm.record_output(self.num_rows()); + let n_bytes = get_record_batch_memory_size(&self); + bm.output_bytes.add(n_bytes); self } } @@ -218,6 +236,8 @@ impl RecordOutput for RecordBatch { impl RecordOutput for &RecordBatch { fn record_output(self, bm: &BaselineMetrics) -> Self { bm.record_output(self.num_rows()); + let n_bytes = get_record_batch_memory_size(self); + bm.output_bytes.add(n_bytes); self } } diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs index 74ba5a2a1834..88ec1a3f67d1 100644 --- a/datafusion/physical-plan/src/metrics/builder.rs +++ b/datafusion/physical-plan/src/metrics/builder.rs @@ -151,6 +151,14 @@ impl<'a> MetricBuilder<'a> { count } + /// Consume self and create a new counter for recording total output bytes + pub fn output_bytes(self, partition: usize) -> Count { + let count = Count::new(); + self.with_partition(partition) + .build(MetricValue::OutputBytes(count.clone())); + count + } + /// Consume self and create a new gauge for reporting current memory usage pub fn mem_used(self, partition: usize) -> Gauge { let gauge = Gauge::new(); diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs index 0fd7bfb8c812..02aad6eb60ac 100644 --- a/datafusion/physical-plan/src/metrics/mod.rs +++ b/datafusion/physical-plan/src/metrics/mod.rs @@ -296,6 +296,7 @@ impl MetricsSet { MetricValue::ElapsedCompute(_) => false, MetricValue::SpillCount(_) => false, MetricValue::SpilledBytes(_) => false, + MetricValue::OutputBytes(_) => false, MetricValue::SpilledRows(_) => false, MetricValue::CurrentMemoryUsage(_) => false, MetricValue::Gauge { name, .. 
} => name == metric_name, diff --git a/datafusion/physical-plan/src/metrics/value.rs b/datafusion/physical-plan/src/metrics/value.rs index 3149fca95ba8..fc947935503c 100644 --- a/datafusion/physical-plan/src/metrics/value.rs +++ b/datafusion/physical-plan/src/metrics/value.rs @@ -395,6 +395,8 @@ pub enum MetricValue { SpillCount(Count), /// Total size of spilled bytes produced: "spilled_bytes" metric SpilledBytes(Count), + /// Total size of output bytes produced: "output_bytes" metric + OutputBytes(Count), /// Total size of spilled rows produced: "spilled_rows" metric SpilledRows(Count), /// Current memory used @@ -449,6 +451,9 @@ impl PartialEq for MetricValue { (MetricValue::SpilledBytes(count), MetricValue::SpilledBytes(other)) => { count == other } + (MetricValue::OutputBytes(count), MetricValue::OutputBytes(other)) => { + count == other + } (MetricValue::SpilledRows(count), MetricValue::SpilledRows(other)) => { count == other } @@ -505,6 +510,7 @@ impl MetricValue { Self::OutputRows(_) => "output_rows", Self::SpillCount(_) => "spill_count", Self::SpilledBytes(_) => "spilled_bytes", + Self::OutputBytes(_) => "output_bytes", Self::SpilledRows(_) => "spilled_rows", Self::CurrentMemoryUsage(_) => "mem_used", Self::ElapsedCompute(_) => "elapsed_compute", @@ -523,6 +529,7 @@ impl MetricValue { Self::OutputRows(count) => count.value(), Self::SpillCount(count) => count.value(), Self::SpilledBytes(bytes) => bytes.value(), + Self::OutputBytes(bytes) => bytes.value(), Self::SpilledRows(count) => count.value(), Self::CurrentMemoryUsage(used) => used.value(), Self::ElapsedCompute(time) => time.value(), @@ -550,6 +557,7 @@ impl MetricValue { Self::OutputRows(_) => Self::OutputRows(Count::new()), Self::SpillCount(_) => Self::SpillCount(Count::new()), Self::SpilledBytes(_) => Self::SpilledBytes(Count::new()), + Self::OutputBytes(_) => Self::OutputBytes(Count::new()), Self::SpilledRows(_) => Self::SpilledRows(Count::new()), Self::CurrentMemoryUsage(_) => Self::CurrentMemoryUsage(Gauge::new()), Self::ElapsedCompute(_) => Self::ElapsedCompute(Time::new()), @@ -588,6 +596,7 @@ impl MetricValue { (Self::OutputRows(count), Self::OutputRows(other_count)) | (Self::SpillCount(count), Self::SpillCount(other_count)) | (Self::SpilledBytes(count), Self::SpilledBytes(other_count)) + | (Self::OutputBytes(count), Self::OutputBytes(other_count)) | (Self::SpilledRows(count), Self::SpilledRows(other_count)) | ( Self::Count { count, .. }, @@ -638,18 +647,21 @@ impl MetricValue { /// numbers are "more useful" (and displayed first) pub fn display_sort_key(&self) -> u8 { match self { - Self::OutputRows(_) => 0, // show first - Self::ElapsedCompute(_) => 1, // show second - Self::SpillCount(_) => 2, - Self::SpilledBytes(_) => 3, - Self::SpilledRows(_) => 4, - Self::CurrentMemoryUsage(_) => 5, - Self::Count { .. } => 6, - Self::Gauge { .. } => 7, - Self::Time { .. } => 8, - Self::StartTimestamp(_) => 9, // show timestamps last - Self::EndTimestamp(_) => 10, - Self::Custom { .. } => 11, + // `BaselineMetrics` that is common for most operators + Self::OutputRows(_) => 0, + Self::ElapsedCompute(_) => 1, + Self::OutputBytes(_) => 2, + // Other metrics + Self::SpillCount(_) => 3, + Self::SpilledBytes(_) => 4, + Self::SpilledRows(_) => 5, + Self::CurrentMemoryUsage(_) => 6, + Self::Count { .. } => 7, + Self::Gauge { .. } => 8, + Self::Time { .. } => 9, + Self::StartTimestamp(_) => 10, // show timestamps last + Self::EndTimestamp(_) => 11, + Self::Custom { .. 
} => 12, } } @@ -669,7 +681,7 @@ impl Display for MetricValue { | Self::Count { count, .. } => { write!(f, "{count}") } - Self::SpilledBytes(count) => { + Self::SpilledBytes(count) | Self::OutputBytes(count) => { let readable_count = human_readable_size(count.value()); write!(f, "{readable_count}") } diff --git a/docs/source/user-guide/metrics.md b/docs/source/user-guide/metrics.md index f2634b901518..1fb2f4a5c770 100644 --- a/docs/source/user-guide/metrics.md +++ b/docs/source/user-guide/metrics.md @@ -27,10 +27,11 @@ DataFusion operators expose runtime metrics so you can understand where time is `BaselineMetrics` are available in most physical operators to capture common measurements. -| Metric | Description | -| --------------- | ------------------------------------------------------ | -| elapsed_compute | CPU time the operator actively spends processing work. | -| output_rows | Total number of rows the operator produces. | +| Metric | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| elapsed_compute | CPU time the operator actively spends processing work. | +| output_rows | Total number of rows the operator produces. | +| output_bytes | Memory usage of all output batches. Note: This value may be overestimated. If multiple output `RecordBatch` instances share underlying memory buffers, their sizes will be counted multiple times. | ## Operator-specific Metrics From b291f33decc76439f75f69b4e3dc98096e840107 Mon Sep 17 00:00:00 2001 From: Aryamaan Singh <71913204+toxicteddy00077@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:25:39 +0530 Subject: [PATCH 013/157] Fix: Error rather than silently ignore extra parameter passed to ceil/floor (#18265) ## Which issue does this PR close? - Closes #18175 ## Rationale for this change The Ceil/Floor calls via SQL was being parsed such that they were taking 2 arguments instead of 1, the second of which is not currently needed and the second argument was being ignored and passed silently. ## What changes are included in this PR? The second parameter(`field`) which was being passed if is of the `CeilFloorKind` enum from `sqlparser` crate . Neither of the enum's two variants (`DateTimeField` and `Scale`)are being implemented hence they have been ignored with apporpriate error type and only succeeds if the `DateTimeField` has `NoDateTime` variant i,e it is treated as empty. ## Are these changes tested? All Unit Tests pass successfully. 
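For illustration, a minimal SQL sketch of the new behavior (the statements and error messages mirror the sqllogictest cases added in this PR):

```sql
-- Before this change the extra parameter was parsed and then silently dropped;
-- now passing a scale or datetime field raises a "not implemented" error:
SELECT ceil(100.1234, 1);       -- Error: CEIL with scale is not supported
SELECT ceil(100.1234 TO year);  -- Error: CEIL with datetime is not supported
SELECT floor(100.1234, 1);      -- Error: FLOOR with scale is not supported

-- The plain single-argument form keeps working as before:
SELECT ceil(100.1234);
```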
--------- Co-authored-by: Andrew Lamb --- datafusion/sql/src/expr/mod.rs | 37 +++++++++++++------ datafusion/sqllogictest/test_files/scalar.slt | 16 ++++++++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index a016f28db417..035250adfdbf 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -20,9 +20,10 @@ use datafusion_expr::planner::{ PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr, }; use sqlparser::ast::{ - AccessExpr, BinaryOperator, CastFormat, CastKind, DataType as SQLDataType, - DictionaryField, Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry, - StructField, Subscript, TrimWhereField, TypedString, Value, ValueWithSpan, + AccessExpr, BinaryOperator, CastFormat, CastKind, CeilFloorKind, + DataType as SQLDataType, DateTimeField, DictionaryField, Expr as SQLExpr, + ExprWithAlias as SQLExprWithAlias, MapEntry, StructField, Subscript, TrimWhereField, + TypedString, Value, ValueWithSpan, }; use std::sync::Arc; @@ -510,14 +511,28 @@ impl SqlToRel<'_, S> { self.sql_grouping_sets_to_expr(exprs, schema, planner_context) } - SQLExpr::Floor { - expr, - field: _field, - } => self.sql_fn_name_to_expr(*expr, "floor", schema, planner_context), - SQLExpr::Ceil { - expr, - field: _field, - } => self.sql_fn_name_to_expr(*expr, "ceil", schema, planner_context), + SQLExpr::Floor { expr, field } => match field { + CeilFloorKind::DateTimeField(DateTimeField::NoDateTime) => { + self.sql_fn_name_to_expr(*expr, "floor", schema, planner_context) + } + CeilFloorKind::DateTimeField(_) => { + not_impl_err!("FLOOR with datetime is not supported") + } + CeilFloorKind::Scale(_) => { + not_impl_err!("FLOOR with scale is not supported") + } + }, + SQLExpr::Ceil { expr, field } => match field { + CeilFloorKind::DateTimeField(DateTimeField::NoDateTime) => { + self.sql_fn_name_to_expr(*expr, "ceil", schema, planner_context) + } + CeilFloorKind::DateTimeField(_) => { + not_impl_err!("CEIL with datetime is not supported") + } + CeilFloorKind::Scale(_) => { + not_impl_err!("CEIL with scale is not supported") + } + }, SQLExpr::Overlay { expr, overlay_what, diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index b0e200015dfd..faa0d69ae84b 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -309,6 +309,14 @@ select ceil(a), ceil(b), ceil(c) from small_floats; 1 0 0 1 0 1 +# ceil with scale parameter(Scale not supported) +query error DataFusion error: This feature is not implemented: CEIL with scale is not supported +select ceil(100.1234, 1) + +# ceil with datetime parameter (not supported) +query error DataFusion error: This feature is not implemented: CEIL with datetime is not supported +select ceil(100.1234 to year) + ## degrees # degrees scalar function @@ -448,6 +456,14 @@ select floor(a), floor(b), floor(c) from signed_integers; 2 -1000 123 4 NULL NULL +# floor with scale parameter(Scale not supported) +query error DataFusion error: This feature is not implemented: FLOOR with scale is not supported +select floor(a, 1) + +# floor with datetime parameter ( not supported) +query error DataFusion error: This feature is not implemented: FLOOR with datetime is not supported +select floor(a to year) + ## ln # ln scalar function From 9c64644b2314f30725a48e13551ed3daa1e00fcb Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Mon, 27 Oct 2025 00:06:54 -0700 Subject: 
[PATCH 014/157] fix: Support Dictionary[Int32, Binary] for bitmap count spark function (#18273) ## Which issue does this PR close? Closes https://github.com/apache/datafusion/issues/18058 ## Rationale for this change When adding the bitmap_count function to Comet, we get the following error - org.apache.comet.CometNativeException: Error from DataFusion: bitmap_count expects Binary/BinaryView/FixedSizeBinary/LargeBinary as argument, got Dictionary(Int32, Binary). ## Are these changes tested? Added new UT --------- Co-authored-by: Kazantsev Maksim --- .../spark/src/function/bitmap/bitmap_count.rs | 65 +++++++++++++++++-- .../test_files/spark/bitmap/bitmap_count.slt | 32 +++++++++ 2 files changed, 91 insertions(+), 6 deletions(-) diff --git a/datafusion/spark/src/function/bitmap/bitmap_count.rs b/datafusion/spark/src/function/bitmap/bitmap_count.rs index 15bd33229a3d..56a9c5edb812 100644 --- a/datafusion/spark/src/function/bitmap/bitmap_count.rs +++ b/datafusion/spark/src/function/bitmap/bitmap_count.rs @@ -19,13 +19,13 @@ use std::any::Any; use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryArray, BinaryViewArray, FixedSizeBinaryArray, Int64Array, - LargeBinaryArray, + as_dictionary_array, Array, ArrayRef, BinaryArray, BinaryViewArray, + FixedSizeBinaryArray, Int64Array, LargeBinaryArray, }; -use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ - Binary, BinaryView, FixedSizeBinary, Int64, LargeBinary, + Binary, BinaryView, Dictionary, FixedSizeBinary, LargeBinary, }; +use arrow::datatypes::{DataType, Int16Type, Int32Type, Int64Type, Int8Type}; use datafusion_common::utils::take_function_args; use datafusion_common::{internal_err, Result}; use datafusion_expr::{ @@ -71,7 +71,7 @@ impl ScalarUDFImpl for BitmapCount { } fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(Int64) + Ok(DataType::Int64) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { @@ -90,6 +90,17 @@ macro_rules! downcast_and_count_ones { }}; } +macro_rules! 
downcast_dict_and_count_ones { + ($input_dict:expr, $key_array_type:ident) => {{ + let dict_array = as_dictionary_array::<$key_array_type>($input_dict); + let array = dict_array.downcast_dict::().unwrap(); + Ok(array + .into_iter() + .map(binary_count_ones) + .collect::()) + }}; +} + pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result { let [input_array] = take_function_args("bitmap_count", arg)?; @@ -100,6 +111,17 @@ pub fn bitmap_count_inner(arg: &[ArrayRef]) -> Result { FixedSizeBinary(_size) => { downcast_and_count_ones!(input_array, FixedSizeBinaryArray) } + Dictionary(k, v) if v.as_ref() == &Binary => match k.as_ref() { + DataType::Int8 => downcast_dict_and_count_ones!(input_array, Int8Type), + DataType::Int16 => downcast_dict_and_count_ones!(input_array, Int16Type), + DataType::Int32 => downcast_dict_and_count_ones!(input_array, Int32Type), + DataType::Int64 => downcast_dict_and_count_ones!(input_array, Int64Type), + data_type => { + internal_err!( + "bitmap_count does not support Dictionary({data_type}, Binary)" + ) + } + }, data_type => { internal_err!("bitmap_count does not support {data_type}") } @@ -114,8 +136,12 @@ mod tests { use crate::function::utils::test::test_scalar_function; use arrow::array::{Array, Int64Array}; use arrow::datatypes::DataType::Int64; + use arrow::datatypes::{DataType, Field}; + use datafusion_common::config::ConfigOptions; use datafusion_common::{Result, ScalarValue}; - use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + use datafusion_expr::ColumnarValue::Scalar; + use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; + use std::sync::Arc; macro_rules! test_bitmap_count_binary_invoke { ($INPUT:expr, $EXPECTED:expr) => { @@ -171,4 +197,31 @@ mod tests { ); Ok(()) } + + #[test] + fn test_dictionary_encoded_bitmap_count_invoke() -> Result<()> { + let dict = Scalar(ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Binary(Some(vec![0xFFu8, 0xFFu8]))), + )); + + let arg_fields = vec![Field::new( + "a", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Binary)), + true, + ) + .into()]; + let args = ScalarFunctionArgs { + args: vec![dict.clone()], + arg_fields, + number_rows: 1, + return_field: Field::new("f", Int64, true).into(), + config_options: Arc::new(ConfigOptions::default()), + }; + let udf = BitmapCount::new(); + let actual = udf.invoke_with_args(args)?; + let expect = Scalar(ScalarValue::Int64(Some(16))); + assert_eq!(*actual.into_array(1)?, *expect.into_array(1)?); + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt index 2789efef7bf3..39dca512226b 100644 --- a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt +++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_count.slt @@ -59,3 +59,35 @@ SELECT bitmap_count(arrow_cast(a, 'FixedSizeBinary(2)')) FROM (VALUES (X'1010'), 5 16 NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int32, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int8, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int16, Binary)')) FROM (VALUES (X'1010'), (X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL + +query I +SELECT bitmap_count(arrow_cast(a, 'Dictionary(Int64, Binary)')) FROM (VALUES (X'1010'), 
(X'0AB0'), (X'FFFF'), (NULL)) AS t(a); +---- +2 +5 +16 +NULL From 1c6916bb8cc70524074022c284d1671257dae401 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 27 Oct 2025 08:18:33 -0400 Subject: [PATCH 015/157] chore(deps): Update `half` to 2.7.1, ignore `RUSTSEC-2025-0111` (#18287) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes https://github.com/apache/datafusion/issues/18288 ## Rationale for this change `cargo audit` says that the current version of `half` we have in our Cargo.lock file was yanked ``` Crate: half Version: 2.7.0 Warning: yanked Dependency tree: half 2.7.0 ``` And indeed it is: https://crates.io/crates/half/versions Screenshot 2025-10-26 at 7 20 54 AM So let's update to a non yanked version ## What changes are included in this PR? run `cargo update -p half` and check the result in ## Are these changes tested? ## Are there any user-facing changes? --- .github/workflows/audit.yml | 7 ++++++- Cargo.lock | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index cae620baf46c..ac8d6ed6f993 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -46,4 +46,9 @@ jobs: with: tool: cargo-audit - name: Run audit check - run: cargo audit + # RUSTSEC-2025-0111: tokio-tar is by testcontainers for orchestration + # of testing, so does not impact DataFusion's security + # See https://github.com/apache/datafusion/issues/18288 + # NOTE: can remove this once testcontainers releases a version that includes + # https://github.com/testcontainers/testcontainers-rs/pull/852 + run: cargo audit --ignore RUSTSEC-2025-0111 diff --git a/Cargo.lock b/Cargo.lock index e368dcf9a91e..735738338c3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3255,9 +3255,9 @@ dependencies = [ [[package]] name = "half" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54c115d4f30f52c67202f079c5f9d8b49db4691f460fdb0b4c2e838261b2ba5" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", From f8d05e850a57d800f50aa032c82ad5a9a6e19f1a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 08:23:03 -0400 Subject: [PATCH 016/157] chore(deps): bump taiki-e/install-action from 2.62.36 to 2.62.38 (#18293) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.62.36 to 2.62.38.
Release notes

Sourced from taiki-e/install-action's releases.

2.62.38

  • Update coreutils@latest to 0.3.0.

  • Update wasmtime@latest to 38.0.3.

  • Update mise@latest to 2025.10.17.

  • Update cargo-tarpaulin@latest to 0.34.1.

2.62.37

  • Update cargo-binstall@latest to 1.15.8.

  • Update zizmor@latest to 1.16.0.

  • Update mise@latest to 2025.10.16.

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

  • Update mise@latest to 2025.10.18.

[2.62.38] - 2025-10-25

  • Update coreutils@latest to 0.3.0.

  • Update wasmtime@latest to 38.0.3.

  • Update mise@latest to 2025.10.17.

  • Update cargo-tarpaulin@latest to 0.34.1.

[2.62.37] - 2025-10-24

  • Update cargo-binstall@latest to 1.15.8.

  • Update zizmor@latest to 1.16.0.

  • Update mise@latest to 2025.10.16.

[2.62.36] - 2025-10-23

  • Update syft@latest to 1.36.0.

  • Update vacuum@latest to 0.19.0.

  • Update mise@latest to 2025.10.15.

[2.62.35] - 2025-10-22

  • Update wasmtime@latest to 38.0.2.

  • Update cargo-nextest@latest to 0.9.108.

  • Update mise@latest to 2025.10.14.

  • Update vacuum@latest to 0.18.9.

... (truncated)

Commits
  • c5b1b6f Release 2.62.38
  • 7cd74f6 Update coreutils@latest to 0.3.0
  • def9901 Update wasmtime@latest to 38.0.3
  • a9d3853 Update coreutils manifest
  • 958d48b Update mise@latest to 2025.10.17
  • fb48599 Update cargo-tarpaulin@latest to 0.34.1
  • 1c7b1d3 Release 2.62.37
  • 18cba62 Update cargo-binstall@latest to 1.15.8
  • f3c0c69 Update zizmor@latest to 1.16.0
  • 99fc3e5 Update mise@latest to 2025.10.16
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.62.36&new-version=2.62.38)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index ac8d6ed6f993..3685bb2f9a78 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@c5b1b6f479c32f356cc6f4ba672a47f63853b13b # v2.62.38 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e9606e15c4ec..4b61a04bfb14 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -425,7 +425,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@c5b1b6f479c32f356cc6f4ba672a47f63853b13b # v2.62.38 with: tool: wasm-pack - name: Run tests with headless mode @@ -752,7 +752,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@ebb229c6baa68383264f2822689b07b4916d9177 # v2.62.36 + uses: taiki-e/install-action@c5b1b6f479c32f356cc6f4ba672a47f63853b13b # v2.62.38 with: tool: cargo-msrv From b76f9eb4703f2f77e20ffaabd7d3db9bf0493ba2 Mon Sep 17 00:00:00 2001 From: Martin <57065083+sm4rtm4art@users.noreply.github.com> Date: Mon, 27 Oct 2025 13:43:48 +0100 Subject: [PATCH 017/157] "Gentle Introduction to Arrow / Record Batches" #11336 (#18051) ## Which issue does this PR close? - Closes #11336 Since this is my first contribution, I suppose to mention @alamb , author of the Issue #11336 Could you please trigger the CI? Thanks! ## Rationale for this change The Arrow introduction guide (#11336) needed improvements to make it more accessible for newcomers while providing better navigation to advanced topics. ## What changes are included in this PR? Issue #11336 requested a gentle introduction to Apache Arrow and RecordBatches to help DataFusion users understand the foundational concepts. This PR enhances the existing Arrow introduction guide with clearer explanations, practical examples, visual aids, and comprehensive navigation links to make it more accessible for newcomers while providing pathways to advanced topics. Was unsure if this fits to `docs/source/user-guide/dataframe.md' ## Are these changes tested? applyed prettier, like described. ## Are there any user-facing changes? Yes - improved documentation for the Arrow introduction guide at `docs/source/user-guide/arrow-introduction.md` --------- Co-authored-by: Martin Co-authored-by: Andrew Lamb --- datafusion/core/src/lib.rs | 31 ++- docs/source/index.rst | 1 + docs/source/user-guide/arrow-introduction.md | 255 +++++++++++++++++++ docs/source/user-guide/dataframe.md | 2 + 4 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 docs/source/user-guide/arrow-introduction.md diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index e7ace544a11c..78db28eaacc7 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -443,7 +443,30 @@ //! other operators read a single [`RecordBatch`] from their input to produce a //! 
single [`RecordBatch`] as output. //! -//! For example, given this SQL query: +//! For example, given this SQL: +//! +//! ```sql +//! SELECT name FROM 'data.parquet' WHERE id > 10 +//! ``` +//! +//! An simplified DataFusion execution plan is shown below. It first reads +//! data from the Parquet file, then applies the filter, then the projection, +//! and finally produces output. Each step processes one [`RecordBatch`] at a +//! time. Multiple batches are processed concurrently on different CPU cores +//! for plans with multiple partitions. +//! +//! ```text +//! ┌─────────────┐ ┌──────────────┐ ┌────────────────┐ ┌──────────────────┐ ┌──────────┐ +//! │ Parquet │───▶│ DataSource │───▶│ FilterExec │───▶│ ProjectionExec │───▶│ Results │ +//! │ File │ │ │ │ │ │ │ │ │ +//! └─────────────┘ └──────────────┘ └────────────────┘ └──────────────────┘ └──────────┘ +//! (reads data) (id > 10) (keeps "name" col) +//! RecordBatch ───▶ RecordBatch ────▶ RecordBatch ────▶ RecordBatch +//! ``` +//! +//! DataFusion uses the classic "pull" based control flow (explained more in the +//! next section) to implement streaming execution. As an example, +//! consider the following SQL query: //! //! ```sql //! SELECT date_trunc('month', time) FROM data WHERE id IN (10,20,30); @@ -897,6 +920,12 @@ doc_comment::doctest!("../../../README.md", readme_example_test); // For example, if `user_guide_expressions(line 123)` fails, // go to `docs/source/user-guide/expressions.md` to find the relevant problem. // +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/user-guide/arrow-introduction.md", + user_guide_arrow_introduction +); + #[cfg(doctest)] doc_comment::doctest!( "../../../docs/source/user-guide/concepts-readings-events.md", diff --git a/docs/source/index.rst b/docs/source/index.rst index 6bb3c9485b71..b589c9ce4047 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -118,6 +118,7 @@ To get started, see user-guide/crate-configuration user-guide/cli/index user-guide/dataframe + user-guide/arrow-introduction user-guide/expressions user-guide/sql/index user-guide/configs diff --git a/docs/source/user-guide/arrow-introduction.md b/docs/source/user-guide/arrow-introduction.md new file mode 100644 index 000000000000..89662a0c29c5 --- /dev/null +++ b/docs/source/user-guide/arrow-introduction.md @@ -0,0 +1,255 @@ + + +# Gentle Arrow Introduction + +```{contents} +:local: +:depth: 2 +``` + +## Overview + +DataFusion uses [Apache Arrow] as its native in-memory format, so anyone using DataFusion will likely interact with Arrow at some point. This guide introduces the key Arrow concepts you need to know to effectively use DataFusion. + +Apache Arrow defines a standardized columnar representation for in-memory data. This enables different systems and languages (e.g., Rust and Python) to share data with zero-copy interchange, avoiding serialization overhead. In addition to zero copy interchange, Arrow also standardizes best practice columnar data representation enabling high performance analytical processing through vectorized execution. + +## Columnar Layout + +Quick visual: row-major (left) vs Arrow's columnar layout (right). For a deeper primer, see the [arrow2 guide]. 
+ +```text +Traditional Row Storage: Arrow Columnar Storage: +┌──────────────────┐ ┌─────────┬─────────┬──────────┐ +│ id │ name │ age │ │ id │ name │ age │ +├────┼──────┼──────┤ ├─────────┼─────────┼──────────┤ +│ 1 │ A │ 30 │ │ [1,2,3] │ [A,B,C] │[30,25,35]│ +│ 2 │ B │ 25 │ └─────────┴─────────┴──────────┘ +│ 3 │ C │ 35 │ ↑ ↑ ↑ +└──────────────────┘ Int32Array StringArray Int32Array +(read entire rows) (process entire columns at once) +``` + +## `RecordBatch` + +Arrow's standard unit for packaging data is the **[`RecordBatch`]**. + +A **[`RecordBatch`]** represents a horizontal slice of a table—a collection of equal-length columnar arrays that conform to a defined schema. Each column within the slice is a contiguous Arrow array, and all columns have the same number of rows (length). This chunked, immutable unit enables efficient streaming and parallel execution. + +Think of it as having two perspectives: + +- **Columnar inside**: Each column (`id`, `name`, `age`) is a contiguous array optimized for vectorized operations +- **Row-chunked externally**: The batch represents a chunk of rows (e.g., rows 1-1000), making it a manageable unit for streaming + +RecordBatches are **immutable snapshots**—once created, they cannot be modified. Any transformation produces a _new_ RecordBatch, enabling safe parallel processing without locks or coordination overhead. + +This design allows DataFusion to process streams of row-based chunks while gaining maximum performance from the columnar layout. + +## Streaming Through the Engine + +DataFusion processes queries as pull-based pipelines where operators request batches from their inputs. This streaming approach enables early result production, bounds memory usage (spilling to disk only when necessary), and naturally supports parallel execution across multiple CPU cores. + +For example, given the following query: + +```sql +SELECT name FROM 'data.parquet' WHERE id > 10 +``` + +The DataFusion Pipeline looks like this: + +```text + +┌─────────────┐ ┌──────────────┐ ┌────────────────┐ ┌──────────────────┐ ┌──────────┐ +│ Parquet │───▶│ Scan │───▶│ Filter │───▶│ Projection │───▶│ Results │ +│ File │ │ Operator │ │ Operator │ │ Operator │ │ │ +└─────────────┘ └──────────────┘ └────────────────┘ └──────────────────┘ └──────────┘ + (reads data) (id > 10) (keeps "name" col) + RecordBatch ───▶ RecordBatch ────▶ RecordBatch ────▶ RecordBatch +``` + +In this pipeline, [`RecordBatch`]es are the "packages" of columnar data that flow between the different stages of query execution. Each operator processes batches incrementally, enabling the system to produce results before reading the entire input. + +## Creating `ArrayRef` and `RecordBatch`es + +Sometimes you need to create Arrow data programmatically rather than reading from files. + +The first thing needed is creating an Arrow Array, for each column. [arrow-rs] provides array builders and `From` impls to create arrays from Rust vectors. + +```rust +use arrow::array::{StringArray, Int32Array}; +// Create an Int32Array from a vector of i32 values +let ids = Int32Array::from(vec![1, 2, 3]); +// There are similar constructors for other array types, e.g., StringArray, Float64Array, etc. +let names = StringArray::from(vec![Some("alice"), None, Some("carol")]); +``` + +Every element in an Arrow array can be "null" (aka missing). Often, arrays are +created from `Option` values to indicate nullability (e.g., `Some("alice")` +vs `None` above). 
+ +Note: You'll see [`Arc`] used frequently in the code—Arrow arrays are wrapped in +[`Arc`] (atomically reference-counted pointers) to enable cheap, thread-safe +sharing across operators and tasks. [`ArrayRef`] is simply a type alias for +`Arc`. To create an `ArrayRef`, wrap your array in `Arc::new(...)` as shown below. + +```rust +use std::sync::Arc; +# use arrow::array::{ArrayRef, Int32Array, StringArray}; +// To get an ArrayRef, wrap the Int32Array in an Arc. +// (note you will often have to explicitly type annotate to ArrayRef) +let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + +// you can also store Strings and other types in ArrayRefs +let arr: ArrayRef = Arc::new( + StringArray::from(vec![Some("alice"), None, Some("carol")]) +); +``` + +To create a [`RecordBatch`], you need to define its [`Schema`] (the column names and types) and provide the corresponding columns as [`ArrayRef`]s as shown below: + +```rust +# use std::sync::Arc; +# use arrow_schema::ArrowError; +# use arrow::array::{ArrayRef, Int32Array, StringArray, RecordBatch}; +use arrow_schema::{DataType, Field, Schema}; + +// Create the columns as Arrow arrays +let ids = Int32Array::from(vec![1, 2, 3]); +let names = StringArray::from(vec![Some("alice"), None, Some("carol")]); +// Create the schema +let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), // false means non-nullable + Field::new("name", DataType::Utf8, true), // true means nullable +])); +// Assemble the columns +let cols: Vec = vec![ + Arc::new(ids), + Arc::new(names) +]; +// Finally, create the RecordBatch +RecordBatch::try_new(schema, cols).expect("Failed to create RecordBatch"); +``` + +## Working with `ArrayRef` and `RecordBatch` + +Most DataFusion APIs are in terms of [`ArrayRef`] and [`RecordBatch`]. To work with the +underlying data, you typically downcast the [`ArrayRef`] to its concrete type +(e.g., [`Int32Array`]). + +To do so either use the `as_any().downcast_ref::()` method or the +`as_::()` helper method from the [AsArray] trait. + +[asarray]: https://docs.rs/arrow-array/latest/arrow_array/cast/trait.AsArray.html + +```rust +# use std::sync::Arc; +# use arrow::datatypes::{DataType, Int32Type}; +# use arrow::array::{AsArray, ArrayRef, Int32Array, RecordBatch}; +# let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); +// First check the data type of the array +match arr.data_type() { + &DataType::Int32 => { + // Downcast to Int32Array + let int_array = arr.as_primitive::(); + // Now you can access Int32Array methods + for i in 0..int_array.len() { + println!("Value at index {}: {}", i, int_array.value(i)); + } + } + _ => { + println ! ("Array is not of type Int32"); + } +} +``` + +The following two downcasting methods are equivalent: + +```rust +# use std::sync::Arc; +# use arrow::datatypes::{DataType, Int32Type}; +# use arrow::array::{AsArray, ArrayRef, Int32Array, RecordBatch}; +# let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); +// Downcast to Int32Array using as_any +let int_array1 = arr.as_any().downcast_ref::().unwrap(); +// This is the same as using the as_::() helper +let int_array2 = arr.as_primitive::(); +assert_eq!(int_array1, int_array2); +``` + +## Common Pitfalls + +When working with Arrow and RecordBatches, watch out for these common issues: + +- **Schema consistency**: All batches in a stream must share the exact same [`Schema`]. 
For example, you can't have one batch where a column is [`Int32`] and the next where it's [`Int64`], even if the values would fit +- **Immutability**: Arrays are immutable—to "modify" data, you must build new arrays or new RecordBatches. For instance, to change a value in an array, you'd create a new array with the updated value +- **Row by Row Processing**: Avoid iterating over Arrays element by element when possible, and use Arrow's built-in [compute kernels] instead +- **Type mismatches**: Mixed input types across files may require explicit casts. For example, a string column `"123"` from a CSV file won't automatically join with an integer column `123` from a Parquet file—you'll need to cast one to match the other. Use Arrow's [`cast`] kernel where appropriate +- **Batch size assumptions**: Don't assume a particular batch size; always iterate until the stream ends. One file might produce 8192-row batches while another produces 1024-row batches + +[compute kernels]: https://docs.rs/arrow/latest/arrow/compute/index.html + +## Further reading + +**Arrow Documentation:** + +- [Arrow Format Introduction](https://arrow.apache.org/docs/format/Intro.html) - Understand the Arrow specification and why it enables zero-copy data sharing +- [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) - Deep dive into memory layout for performance optimization +- [Arrow Rust Documentation](https://docs.rs/arrow/latest/arrow/) - Complete API reference for the Rust implementation + +**Key API References:** + +- [RecordBatch](https://docs.rs/arrow-array/latest/arrow_array/struct.RecordBatch.html) - The fundamental data structure for columnar data (a table slice) +- [ArrayRef](https://docs.rs/arrow-array/latest/arrow_array/array/type.ArrayRef.html) - Represents a reference-counted Arrow array (single column) +- [DataType](https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html) - Enum of all supported Arrow data types (e.g., Int32, Utf8) +- [Schema](https://docs.rs/arrow-schema/latest/arrow_schema/struct.Schema.html) - Describes the structure of a RecordBatch (column names and types) + +[apache arrow]: https://arrow.apache.org/docs/index.html +[`arc`]: https://doc.rust-lang.org/std/sync/struct.Arc.html +[`arrayref`]: https://docs.rs/arrow-array/latest/arrow_array/array/type.ArrayRef.html +[`cast`]: https://docs.rs/arrow/latest/arrow/compute/fn.cast.html +[`field`]: https://docs.rs/arrow-schema/latest/arrow_schema/struct.Field.html +[`schema`]: https://docs.rs/arrow-schema/latest/arrow_schema/struct.Schema.html +[`datatype`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html +[`int32array`]: https://docs.rs/arrow-array/latest/arrow_array/array/struct.Int32Array.html +[`stringarray`]: https://docs.rs/arrow-array/latest/arrow_array/array/struct.StringArray.html +[`int32`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html#variant.Int32 +[`int64`]: https://docs.rs/arrow-schema/latest/arrow_schema/enum.DataType.html#variant.Int64 +[extension points]: ../library-user-guide/extensions.md +[`tableprovider`]: https://docs.rs/datafusion/latest/datafusion/datasource/trait.TableProvider.html +[custom table providers guide]: ../library-user-guide/custom-table-providers.md +[user-defined functions (udfs)]: ../library-user-guide/functions/adding-udfs.md +[custom optimizer rules and physical operators]: ../library-user-guide/extending-operators.md +[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html 
+[`.register_table()`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.register_table +[`.sql()`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.sql +[`.show()`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.show +[`memtable`]: https://docs.rs/datafusion/latest/datafusion/datasource/struct.MemTable.html +[`sessioncontext`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html +[`csvreadoptions`]: https://docs.rs/datafusion/latest/datafusion/execution/options/struct.CsvReadOptions.html +[`parquetreadoptions`]: https://docs.rs/datafusion/latest/datafusion/execution/options/struct.ParquetReadOptions.html +[`recordbatch`]: https://docs.rs/arrow-array/latest/arrow_array/struct.RecordBatch.html +[`read_csv`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_csv +[`read_parquet`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_parquet +[`read_json`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_json +[`read_avro`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.read_avro +[`dataframe`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html +[`.collect()`]: https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.collect +[arrow2 guide]: https://jorgecarleitao.github.io/arrow2/main/guide/arrow.html#what-is-apache-arrow +[configuration settings]: configs.md +[`datafusion.execution.batch_size`]: configs.md#setting-configuration-options diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md index 82f1eeb2823d..85724a72399a 100644 --- a/docs/source/user-guide/dataframe.md +++ b/docs/source/user-guide/dataframe.md @@ -19,6 +19,8 @@ # DataFrame API +## DataFrame overview + A DataFrame represents a logical set of rows with the same named columns, similar to a [Pandas DataFrame] or [Spark DataFrame]. From bdf346eef744e6db2818286977f7b3dc609d434b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 08:57:08 -0400 Subject: [PATCH 018/157] chore(deps): bump regex from 1.11.3 to 1.12.2 (#18294) Bumps [regex](https://github.com/rust-lang/regex) from 1.11.3 to 1.12.2.
Changelog

Sourced from regex's changelog.

1.12.2 (2025-10-13)

This release fixes a cargo doc breakage on nightly when --cfg docsrs is enabled. This caused documentation to fail to build on docs.rs.

Bug fixes:

1.12.1 (2025-10-10)

This release makes a bug fix in the new regex::Captures::get_match API introduced in 1.12.0. There was an oversight with the lifetime parameter for the Match returned. This is technically a breaking change, but given that it was caught almost immediately and I've yanked the 1.12.0 release, I think this is fine.

1.12.0 (2025-10-10)

This release contains a smattering of bug fixes, a fix for excessive memory consumption in some cases and a new regex::Captures::get_match API.

Improvements:

Bug fixes:

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=regex&package-manager=cargo&previous-version=1.11.3&new-version=1.12.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 735738338c3d..d99e20fc7cab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5253,9 +5253,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.3" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -5265,9 +5265,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.11" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", diff --git a/Cargo.toml b/Cargo.toml index 3e0861c07ab0..98268737eb99 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -172,7 +172,7 @@ insta = { version = "1.43.2", features = ["glob", "filters"] } prost = "0.13.1" rand = "0.9" recursive = "0.1.1" -regex = "1.11" +regex = "1.12" rstest = "0.25.0" serde_json = "1" sqlparser = { version = "0.59.0", default-features = false, features = ["std", "visitor"] } From 98b11c07eec4527b7c63f7b1407dc80d1b8417d1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 12:57:36 +0000 Subject: [PATCH 019/157] chore(deps): bump clap from 4.5.48 to 4.5.50 (#18292) Bumps [clap](https://github.com/clap-rs/clap) from 4.5.48 to 4.5.50.
Release notes

Sourced from clap's releases.

v4.5.50

[4.5.50] - 2025-10-20

Features

  • Accept Cow where String and &str are accepted
Changelog

Sourced from clap's changelog.

[4.5.50] - 2025-10-20

Features

  • Accept Cow where String and &str are accepted

[4.5.49] - 2025-10-13

Fixes

  • (help) Correctly wrap when ANSI escape codes are present
Commits
  • d8acd47 chore: Release
  • 7c2b8d9 docs: Update changelog
  • e69a2ea Merge pull request #5987 from mernen/fix-bash-comp-words-loop
  • e03cc2e Merge pull request #5988 from cordx56/fix-builder-custom-version-docs
  • 5ab2579 fix: Minor fix for builder docs about version
  • 2f66432 fix(complete): Only parse arguments before current
  • 4d9d210 test(complete): Illustrate current behavior in Bash
  • 6abe2f8 chore: Release
  • d5c7454 docs: Update changelog
  • 5b2e960 Merge pull request #5985 from mernen/bash-cur
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=clap&package-manager=cargo&previous-version=4.5.48&new-version=4.5.50)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:

- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 20 ++++++++++---------- datafusion-cli/Cargo.toml | 2 +- datafusion/sqllogictest/Cargo.toml | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d99e20fc7cab..f214c48b278a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1388,9 +1388,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.48" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" +checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" dependencies = [ "clap_builder", "clap_derive", @@ -1398,9 +1398,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.48" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" +checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" dependencies = [ "anstream", "anstyle", @@ -1410,9 +1410,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.47" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -1589,7 +1589,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.48", + "clap 4.5.50", "criterion-plot", "futures", "is-terminal", @@ -1912,7 +1912,7 @@ dependencies = [ "aws-config", "aws-credential-types", "chrono", - "clap 4.5.48", + "clap 4.5.50", "ctor", "datafusion", "datafusion-common", @@ -2635,7 +2635,7 @@ dependencies = [ "bigdecimal", "bytes", "chrono", - "clap 4.5.48", + "clap 4.5.50", "datafusion", "datafusion-spark", "datafusion-substrait", @@ -3979,7 +3979,7 @@ checksum = "5297962ef19edda4ce33aaa484386e0a5b3d7f2f4e037cbeee00503ef6b29d33" dependencies = [ "anstream", "anstyle", - "clap 4.5.48", + "clap 4.5.50", "escape8259", ] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 53744e6c609b..f3069b492352 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -40,7 +40,7 @@ async-trait = { workspace = true } aws-config = "1.8.7" aws-credential-types = "1.2.7" chrono = { workspace = true } -clap = { version = "4.5.47", features = ["cargo", "derive"] } +clap = { version = "4.5.50", features = ["cargo", "derive"] } datafusion = { workspace = true, features = [ "avro", "compression", diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index d02d5f9cb5e4..8ab3932e8433 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -42,7 +42,7 @@ async-trait = { workspace = true } bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } -clap = { version = "4.5.47", features = ["derive", "env"] } +clap = { version = "4.5.50", features = ["derive", "env"] } datafusion = { workspace = true, default-features = true, features = ["avro", "parquet_encryption"] } datafusion-spark = { workspace = true, default-features = true } datafusion-substrait = { workspace = true, default-features = true } From ac8954528a40952cbd94de74b0cb84466bbe83fe Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 27 Oct 2025 
10:16:48 -0400 Subject: [PATCH 020/157] Upgrade DataFusion to arrow/parquet 57.0.0 (#17888) ## Which issue does this PR close? - Related to https://github.com/apache/arrow-rs/issues/7835 - Closes #3666 Note while this PR looks massive, a large portion is display updates due to better display of Fields and DataTypes ## Rationale for this change Upgrade to the latest arrow Also, there are several new features in arrow-57 that I want to be able to test including Variant, arrow-avro, and a new parquet metadata reader. ## What changes are included in this PR? 1. Update arrow/parquet 2. Update prost 3. Update substrait 4. Update pbjson 5. Make API changes to avoid deprecated APIs ## Are these changes tested? By CI ## Are there any user-facing changes? New arrow --- Cargo.lock | 319 ++++------- Cargo.toml | 22 +- datafusion-cli/src/functions.rs | 4 +- datafusion-cli/src/main.rs | 18 +- datafusion-examples/Cargo.toml | 2 +- .../examples/flight/flight_client.rs | 5 +- .../examples/flight/flight_server.rs | 5 +- .../examples/parquet_encrypted.rs | 8 +- .../examples/parquet_encrypted_with_kms.rs | 4 +- datafusion/common/Cargo.toml | 2 +- datafusion/common/src/config.rs | 33 +- datafusion/common/src/dfschema.rs | 2 +- datafusion/common/src/encryption.rs | 32 +- .../common/src/file_options/parquet_writer.rs | 22 +- datafusion/common/src/pyarrow.rs | 26 +- datafusion/core/benches/parquet_query_sql.rs | 5 +- datafusion/core/src/dataframe/parquet.rs | 5 +- .../src/datasource/file_format/parquet.rs | 83 +-- datafusion/core/src/physical_planner.rs | 26 +- .../tests/dataframe/dataframe_functions.rs | 20 +- datafusion/core/tests/dataframe/mod.rs | 54 +- datafusion/core/tests/parquet/encryption.rs | 6 +- .../core/tests/parquet/filter_pushdown.rs | 13 +- .../physical_optimizer/enforce_sorting.rs | 75 ++- .../enforce_sorting_monotonicity.rs | 504 +++++++++--------- .../physical_optimizer/sanity_checker.rs | 4 +- datafusion/core/tests/sql/select.rs | 10 +- .../datasource-parquet/src/file_format.rs | 101 ++-- datafusion/datasource-parquet/src/metadata.rs | 7 +- datafusion/datasource-parquet/src/opener.rs | 5 +- .../datasource-parquet/src/page_filter.rs | 2 +- datafusion/datasource-parquet/src/reader.rs | 5 +- datafusion/datasource-parquet/src/source.rs | 8 +- .../execution/src/parquet_encryption.rs | 4 +- .../functions-aggregate-common/src/utils.rs | 10 +- datafusion/functions/src/core/arrow_cast.rs | 28 +- datafusion/functions/src/datetime/date_bin.rs | 2 +- .../optimizer/src/analyzer/type_coercion.rs | 10 +- .../src/decorrelate_predicate_subquery.rs | 12 +- .../physical-expr/src/expressions/cast.rs | 4 +- .../src/expressions/dynamic_filters.rs | 4 +- .../src/windows/bounded_window_agg_exec.rs | 4 +- datafusion/proto-common/src/to_proto/mod.rs | 13 +- datafusion/proto/src/bytes/mod.rs | 2 +- datafusion/sql/tests/cases/params.rs | 8 +- datafusion/sql/tests/sql_integration.rs | 36 +- datafusion/sqllogictest/test_files/array.slt | 42 +- .../sqllogictest/test_files/arrow_typeof.slt | 22 +- datafusion/sqllogictest/test_files/case.slt | 2 +- .../sqllogictest/test_files/coalesce.slt | 6 +- .../test_files/count_star_rule.slt | 2 +- .../test_files/current_time_timezone.slt | 4 +- datafusion/sqllogictest/test_files/dates.slt | 7 +- datafusion/sqllogictest/test_files/ddl.slt | 2 +- .../sqllogictest/test_files/describe.slt | 2 +- .../sqllogictest/test_files/dictionary.slt | 4 +- .../test_files/expr/date_part.slt | 4 +- .../sqllogictest/test_files/float16.slt | 20 +- .../sqllogictest/test_files/group_by.slt | 6 +- 
.../test_files/information_schema_columns.slt | 2 +- datafusion/sqllogictest/test_files/insert.slt | 6 +- .../test_files/insert_to_external.slt | 4 +- .../sqllogictest/test_files/interval.slt | 6 +- .../sqllogictest/test_files/join_lists.slt | 1 - datafusion/sqllogictest/test_files/joins.slt | 32 +- datafusion/sqllogictest/test_files/map.slt | 4 +- .../sqllogictest/test_files/parquet.slt | 8 +- datafusion/sqllogictest/test_files/pwmj.slt | 2 +- .../sqllogictest/test_files/qualify.slt | 8 +- .../test_files/spark/array/shuffle.slt | 2 - datafusion/sqllogictest/test_files/struct.slt | 38 +- .../sqllogictest/test_files/subquery_sort.slt | 4 +- .../sqllogictest/test_files/timestamps.slt | 176 +++--- .../sqllogictest/test_files/type_coercion.slt | 2 +- datafusion/sqllogictest/test_files/union.slt | 2 +- datafusion/sqllogictest/test_files/unnest.slt | 10 +- datafusion/sqllogictest/test_files/window.slt | 258 ++++----- .../sqllogictest/test_files/window_limits.slt | 24 +- datafusion/substrait/Cargo.toml | 2 +- docs/source/library-user-guide/upgrading.md | 9 + docs/source/user-guide/sql/data_types.md | 13 +- .../source/user-guide/sql/scalar_functions.md | 28 +- 82 files changed, 1138 insertions(+), 1200 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f214c48b278a..55c334e157db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,9 +225,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +checksum = "4df8bb5b0bd64c0b9bc61317fcc480bad0f00e56d3bc32c69a4c8dada4786bae" dependencies = [ "arrow-arith", "arrow-array", @@ -249,23 +249,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +checksum = "a1a640186d3bd30a24cb42264c2dafb30e236a6f50d510e56d40b708c9582491" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +checksum = "219fe420e6800979744c8393b687afb0252b3f8a89b91027d27887b72aa36d31" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -275,25 +275,28 @@ dependencies = [ "chrono-tz", "half", "hashbrown 0.16.0", - "num", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +checksum = "76885a2697a7edf6b59577f568b456afc94ce0e2edc15b784ce3685b6c3c5c27" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +checksum = "9c9ebb4c987e6b3b236fb4a14b20b34835abfdd80acead3ccf1f9bf399e1f168" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,15 +309,15 @@ dependencies = [ "comfy-table", "half", "lexical-core", - "num", + "num-traits", "ryu", ] [[package]] name 
= "arrow-csv" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +checksum = "92386159c8d4bce96f8bd396b0642a0d544d471bdc2ef34d631aec80db40a09c" dependencies = [ "arrow-array", "arrow-cast", @@ -327,21 +330,22 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +checksum = "727681b95de313b600eddc2a37e736dcb21980a40f640314dcf360e2f36bc89b" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-flight" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c8b0ba0784d56bc6266b79f5de7a24b47024e7b3a0045d2ad4df3d9b686099f" +checksum = "f70bb56412a007b0cfc116d15f24dda6adeed9611a213852a004cda20085a3b9" dependencies = [ "arrow-arith", "arrow-array", @@ -359,16 +363,17 @@ dependencies = [ "futures", "once_cell", "paste", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "tonic", + "tonic-prost", ] [[package]] name = "arrow-ipc" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +checksum = "da9ba92e3de170295c98a84e5af22e2b037f0c7b32449445e6c493b5fca27f27" dependencies = [ "arrow-array", "arrow-buffer", @@ -382,9 +387,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +checksum = "b969b4a421ae83828591c6bf5450bd52e6d489584142845ad6a861f42fe35df8" dependencies = [ "arrow-array", "arrow-buffer", @@ -394,19 +399,21 @@ dependencies = [ "chrono", "half", "indexmap 2.12.0", + "itoa", "lexical-core", "memchr", - "num", - "serde", + "num-traits", + "ryu", + "serde_core", "serde_json", "simdutf8", ] [[package]] name = "arrow-ord" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +checksum = "141c05298b21d03e88062317a1f1a73f5ba7b6eb041b350015b1cd6aabc0519b" dependencies = [ "arrow-array", "arrow-buffer", @@ -417,9 +424,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" +checksum = "cfcfb2be2e9096236f449c11f425cddde18c4cc540f516d90f066f10a29ed515" dependencies = [ "arrow-array", "arrow-data", @@ -429,9 +436,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +checksum = "c5f3c06a6abad6164508ed283c7a02151515cef3de4b4ff2cebbcaeb85533db2" dependencies = [ "arrow-array", "arrow-buffer", @@ -442,34 +449,35 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" 
+checksum = "9cfa7a03d1eee2a4d061476e1840ad5c9867a544ca6c4c59256496af5d0a8be5" dependencies = [ "bitflags 2.9.4", "serde", + "serde_core", "serde_json", ] [[package]] name = "arrow-select" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +checksum = "bafa595babaad59f2455f4957d0f26448fb472722c186739f4fac0823a1bdb47" dependencies = [ "ahash 0.8.12", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +checksum = "32f46457dbbb99f2650ff3ac23e46a929e0ab81db809b02aa5511c258348bef2" dependencies = [ "arrow-array", "arrow-buffer", @@ -477,7 +485,7 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] @@ -2143,7 +2151,7 @@ dependencies = [ "mimalloc", "nix", "object_store", - "prost 0.13.5", + "prost", "rand 0.9.2", "serde_json", "tempfile", @@ -2229,7 +2237,7 @@ dependencies = [ "doc-comment", "futures", "log", - "prost 0.13.5", + "prost", "semver", "tokio", ] @@ -2532,7 +2540,7 @@ dependencies = [ "object_store", "pbjson", "pretty_assertions", - "prost 0.13.5", + "prost", "serde", "serde_json", "tokio", @@ -2546,7 +2554,7 @@ dependencies = [ "datafusion-common", "doc-comment", "pbjson", - "prost 0.13.5", + "prost", "serde", ] @@ -2674,7 +2682,7 @@ dependencies = [ "itertools 0.14.0", "object_store", "pbjson-types", - "prost 0.13.5", + "prost", "serde_json", "substrait", "tokio", @@ -3157,16 +3165,16 @@ dependencies = [ name = "gen" version = "0.1.0" dependencies = [ - "pbjson-build 0.8.0", - "prost-build 0.14.1", + "pbjson-build", + "prost-build", ] [[package]] name = "gen-common" version = "0.1.0" dependencies = [ - "pbjson-build 0.8.0", - "prost-build 0.14.1", + "pbjson-build", + "prost-build", ] [[package]] @@ -3506,7 +3514,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.0", + "socket2", "tokio", "tower-service", "tracing", @@ -4184,20 +4192,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - [[package]] name = "num-bigint" version = "0.4.6" @@ -4233,28 +4227,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -4397,9 +4369,9 @@ dependencies = [ [[package]] name = "parquet" -version = "56.2.0" +version = "57.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +checksum = "7a0f31027ef1af7549f7cec603a9a21dce706d3f8d7c2060a68f43c1773be95a" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -4418,8 +4390,9 @@ dependencies = [ "half", "hashbrown 0.16.0", "lz4_flex", - "num", "num-bigint", + "num-integer", + "num-traits", "object_store", "paste", "ring", @@ -4465,26 +4438,14 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pbjson" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7e6349fa080353f4a597daffd05cb81572a9c031a6d4fff7e504947496fcc68" +checksum = "898bac3fa00d0ba57a4e8289837e965baa2dee8c3749f3b11d45a64b4223d9c3" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "serde", ] -[[package]] -name = "pbjson-build" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" -dependencies = [ - "heck 0.5.0", - "itertools 0.13.0", - "prost 0.13.5", - "prost-types 0.13.5", -] - [[package]] name = "pbjson-build" version = "0.8.0" @@ -4493,22 +4454,22 @@ checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ "heck 0.5.0", "itertools 0.14.0", - "prost 0.14.1", - "prost-types 0.14.1", + "prost", + "prost-types", ] [[package]] name = "pbjson-types" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54e5e7bfb1652f95bc361d76f3c780d8e526b134b85417e774166ee941f0887" +checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526" dependencies = [ "bytes", "chrono", "pbjson", - "pbjson-build 0.7.0", - "prost 0.13.5", - "prost-build 0.13.5", + "pbjson-build", + "prost", + "prost-build", "serde", ] @@ -4787,16 +4748,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "prost" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" -dependencies = [ - "bytes", - "prost-derive 0.13.5", -] - [[package]] name = "prost" version = "0.14.1" @@ -4804,27 +4755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7231bd9b3d3d33c86b58adbac74b5ec0ad9f496b19d22801d773636feaa95f3d" dependencies = [ "bytes", - "prost-derive 0.14.1", -] - -[[package]] -name = "prost-build" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" -dependencies = [ - "heck 0.5.0", - "itertools 0.14.0", - "log", - "multimap", - "once_cell", - "petgraph 0.7.1", - "prettyplease", - "prost 0.13.5", - "prost-types 0.13.5", - "regex", - "syn 2.0.106", - "tempfile", + "prost-derive", ] [[package]] @@ -4840,26 +4771,13 @@ dependencies = [ "once_cell", "petgraph 0.7.1", "prettyplease", - "prost 0.14.1", - "prost-types 0.14.1", + "prost", + "prost-types", "regex", "syn 2.0.106", "tempfile", ] -[[package]] -name = "prost-derive" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" -dependencies = [ - "anyhow", - "itertools 0.14.0", - "proc-macro2", - "quote", - "syn 2.0.106", -] - [[package]] name = "prost-derive" version = "0.14.1" @@ -4873,22 +4791,13 @@ dependencies = [ "syn 2.0.106", ] -[[package]] -name = "prost-types" -version = 
"0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" -dependencies = [ - "prost 0.13.5", -] - [[package]] name = "prost-types" version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9b4db3d6da204ed77bb26ba83b6122a73aeb2e87e25fbf7ad2e84c4ccbf8f72" dependencies = [ - "prost 0.14.1", + "prost", ] [[package]] @@ -4931,9 +4840,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" dependencies = [ "indoc", "libc", @@ -4948,19 +4857,18 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" dependencies = [ - "once_cell", "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" dependencies = [ "libc", "pyo3-build-config", @@ -4968,9 +4876,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -4980,9 +4888,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -5020,7 +4928,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.0", + "socket2", "thiserror", "tokio", "tracing", @@ -5057,7 +4965,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.0", + "socket2", "tracing", "windows-sys 0.60.2", ] @@ -5950,16 +5858,6 @@ dependencies = [ "cmake", ] -[[package]] -name = "socket2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "socket2" version = "0.6.0" @@ -6149,18 +6047,18 @@ dependencies = [ [[package]] name = "substrait" -version = "0.58.0" +version = "0.59.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6d24c270c6c672a86c183c3a8439ba46c1936f93cf7296aa692de3b0ff0228" +checksum = "540683f325ab9ab1a2008bc24588f3e76f63b6a3f52bc47e121122376a063639" dependencies = [ "heck 0.5.0", "pbjson", - "pbjson-build 0.7.0", + "pbjson-build", "pbjson-types", "prettyplease", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "protobuf-src", "regress", 
"schemars 0.8.22", @@ -6445,7 +6343,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.0", + "socket2", "tokio-macros", "windows-sys 0.61.0", ] @@ -6481,7 +6379,7 @@ dependencies = [ "postgres-protocol", "postgres-types", "rand 0.9.2", - "socket2 0.6.0", + "socket2", "tokio", "tokio-util", "whoami", @@ -6568,9 +6466,9 @@ dependencies = [ [[package]] name = "tonic" -version = "0.13.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" +checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" dependencies = [ "async-trait", "axum", @@ -6585,8 +6483,8 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost 0.13.5", - "socket2 0.5.10", + "socket2", + "sync_wrapper", "tokio", "tokio-stream", "tower", @@ -6595,6 +6493,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic-prost" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.2" diff --git a/Cargo.toml b/Cargo.toml index 98268737eb99..1cfb23bb183d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,19 +91,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.20", default-features = false } -arrow = { version = "56.2.0", features = [ +arrow = { version = "57.0.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "56.2.0", default-features = false } -arrow-flight = { version = "56.2.0", features = [ +arrow-buffer = { version = "57.0.0", default-features = false } +arrow-flight = { version = "57.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "56.2.0", default-features = false, features = [ +arrow-ipc = { version = "57.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "56.2.0", default-features = false } -arrow-schema = { version = "56.2.0", default-features = false } +arrow-ord = { version = "57.0.0", default-features = false } +arrow-schema = { version = "57.0.0", default-features = false } async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" @@ -156,20 +156,20 @@ half = { version = "2.7.0", default-features = false } hashbrown = { version = "0.14.5", features = ["raw"] } hex = { version = "0.4.3" } indexmap = "2.12.0" +insta = { version = "1.43.2", features = ["glob", "filters"] } itertools = "0.14" log = "^0.4" object_store = { version = "0.12.4", default-features = false } parking_lot = "0.12" -parquet = { version = "56.2.0", default-features = false, features = [ +parquet = { version = "57.0.0", default-features = false, features = [ "arrow", "async", "object_store", ] } -pbjson = { version = "0.7.0" } -pbjson-types = "0.7" +pbjson = { version = "0.8.0" } +pbjson-types = "0.8" # Should match arrow-flight's version of prost. 
-insta = { version = "1.43.2", features = ["glob", "filters"] } -prost = "0.13.1" +prost = "0.14.1" rand = "0.9" recursive = "0.1.1" regex = "1.12" diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index 3ec446c51583..d23b12469e38 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -419,7 +419,9 @@ impl TableFunctionImpl for ParquetMetadataFunc { stats_max_value_arr.push(None); }; compression_arr.push(format!("{:?}", column.compression())); - encodings_arr.push(format!("{:?}", column.encodings())); + // need to collect into Vec to format + let encodings: Vec<_> = column.encodings().collect(); + encodings_arr.push(format!("{:?}", encodings)); index_page_offset_arr.push(column.index_page_offset()); dictionary_page_offset_arr.push(column.dictionary_page_offset()); data_page_offset_arr.push(column.data_page_offset()); diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index bdb2fdf5198e..09fa8ef15af8 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -497,7 +497,7 @@ mod tests { +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -510,7 +510,7 @@ mod tests { 
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 | + | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 | +-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -532,7 +532,7 @@ mod tests { +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ | filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size | 
+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ - | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 | + | ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [PLAIN, RLE, BIT_PACKED] | | | 4 | 152 | 163 | +-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+ "#); @@ -592,9 +592,9 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false | + | alltypes_plain.parquet | 1851 | 6957 | 2 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 996 | 2 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); @@ -623,9 +623,9 @@ mod tests { +-----------------------------------+-----------------+---------------------+------+------------------+ | filename | file_size_bytes | metadata_size_bytes | hits | extra | +-----------------------------------+-----------------+---------------------+------+------------------+ - | alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false | - | alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true | - | lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false | + | alltypes_plain.parquet | 1851 | 6957 | 5 | page_index=false | + | alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true | + | lz4_raw_compressed_larger.parquet | 380836 | 996 | 3 | page_index=false | +-----------------------------------+-----------------+---------------------+------+------------------+ "); diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 68bb5376a1ac..bb0525e57753 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -81,7 +81,7 @@ serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.13.1" +tonic = 
"0.14" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } diff --git a/datafusion-examples/examples/flight/flight_client.rs b/datafusion-examples/examples/flight/flight_client.rs index e3237284b430..ff4b5903ad88 100644 --- a/datafusion-examples/examples/flight/flight_client.rs +++ b/datafusion-examples/examples/flight/flight_client.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use std::sync::Arc; +use tonic::transport::Endpoint; use datafusion::arrow::datatypes::Schema; @@ -34,7 +35,9 @@ async fn main() -> Result<(), Box> { let testdata = datafusion::test_util::parquet_test_data(); // Create Flight client - let mut client = FlightServiceClient::connect("http://localhost:50051").await?; + let endpoint = Endpoint::new("http://localhost:50051")?; + let channel = endpoint.connect().await?; + let mut client = FlightServiceClient::new(channel); // Call get_schema to get the schema of a Parquet file let request = tonic::Request::new(FlightDescriptor { diff --git a/datafusion-examples/examples/flight/flight_server.rs b/datafusion-examples/examples/flight/flight_server.rs index 58bfb7a341c1..22265e415fbd 100644 --- a/datafusion-examples/examples/flight/flight_server.rs +++ b/datafusion-examples/examples/flight/flight_server.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator}; +use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; use std::sync::Arc; use arrow_flight::{PollInfo, SchemaAsIpc}; @@ -106,6 +106,7 @@ impl FlightService for FlightServiceImpl { // add an initial FlightData message that sends schema let options = arrow::ipc::writer::IpcWriteOptions::default(); + let mut compression_context = CompressionContext::default(); let schema_flight_data = SchemaAsIpc::new(&schema, &options); let mut flights = vec![FlightData::from(schema_flight_data)]; @@ -115,7 +116,7 @@ impl FlightService for FlightServiceImpl { for batch in &results { let (flight_dictionaries, flight_batch) = encoder - .encoded_batch(batch, &mut tracker, &options) + .encode(batch, &mut tracker, &options, &mut compression_context) .map_err(|e: ArrowError| Status::internal(e.to_string()))?; flights.extend(flight_dictionaries.into_iter().map(Into::into)); diff --git a/datafusion-examples/examples/parquet_encrypted.rs b/datafusion-examples/examples/parquet_encrypted.rs index e9e239b7a1c3..690d9f2a5f14 100644 --- a/datafusion-examples/examples/parquet_encrypted.rs +++ b/datafusion-examples/examples/parquet_encrypted.rs @@ -16,12 +16,13 @@ // under the License. 
use datafusion::common::DataFusionError; -use datafusion::config::TableParquetOptions; +use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::logical_expr::{col, lit}; use datafusion::parquet::encryption::decrypt::FileDecryptionProperties; use datafusion::parquet::encryption::encrypt::FileEncryptionProperties; use datafusion::prelude::{ParquetReadOptions, SessionContext}; +use std::sync::Arc; use tempfile::TempDir; #[tokio::main] @@ -55,7 +56,7 @@ async fn main() -> datafusion::common::Result<()> { // Write encrypted parquet let mut options = TableParquetOptions::default(); - options.crypto.file_encryption = Some((&encrypt).into()); + options.crypto.file_encryption = Some(ConfigFileEncryptionProperties::from(&encrypt)); parquet_df .write_parquet( tempfile_str.as_str(), @@ -100,7 +101,8 @@ async fn query_dataframe(df: &DataFrame) -> Result<(), DataFusionError> { // Setup encryption and decryption properties fn setup_encryption( parquet_df: &DataFrame, -) -> Result<(FileEncryptionProperties, FileDecryptionProperties), DataFusionError> { +) -> Result<(Arc, Arc), DataFusionError> +{ let schema = parquet_df.schema(); let footer_key = b"0123456789012345".to_vec(); // 128bit/16 let column_key = b"1234567890123450".to_vec(); // 128bit/16 diff --git a/datafusion-examples/examples/parquet_encrypted_with_kms.rs b/datafusion-examples/examples/parquet_encrypted_with_kms.rs index 19b0e8d0b199..45bfd183773a 100644 --- a/datafusion-examples/examples/parquet_encrypted_with_kms.rs +++ b/datafusion-examples/examples/parquet_encrypted_with_kms.rs @@ -226,7 +226,7 @@ impl EncryptionFactory for TestEncryptionFactory { options: &EncryptionFactoryOptions, schema: &SchemaRef, _file_path: &Path, - ) -> Result> { + ) -> Result>> { let config: EncryptionConfig = options.to_extension_options()?; // Generate a random encryption key for this file. 
@@ -268,7 +268,7 @@ impl EncryptionFactory for TestEncryptionFactory { &self, _options: &EncryptionFactoryOptions, _file_path: &Path, - ) -> Result> { + ) -> Result>> { let decryption_properties = FileDecryptionProperties::with_key_retriever(Arc::new(TestKeyRetriever {})) .build()?; diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index f5e51cb236d4..abeb4e66a269 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.25", optional = true } +pyo3 = { version = "0.26", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 271ba6ddcff5..1713377f8d4d 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -26,14 +26,15 @@ use crate::format::{ExplainAnalyzeLevel, ExplainFormat}; use crate::parsers::CompressionTypeVariant; use crate::utils::get_available_parallelism; use crate::{DataFusionError, Result}; +#[cfg(feature = "parquet_encryption")] +use hex; use std::any::Any; use std::collections::{BTreeMap, HashMap}; use std::error::Error; use std::fmt::{self, Display}; use std::str::FromStr; - #[cfg(feature = "parquet_encryption")] -use hex; +use std::sync::Arc; /// A macro that wraps a configuration struct and automatically derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used @@ -2409,13 +2410,13 @@ impl From for FileEncryptionProperties { hex::decode(&val.aad_prefix_as_hex).expect("Invalid AAD prefix"); fep = fep.with_aad_prefix(aad_prefix); } - fep.build().unwrap() + Arc::unwrap_or_clone(fep.build().unwrap()) } } #[cfg(feature = "parquet_encryption")] -impl From<&FileEncryptionProperties> for ConfigFileEncryptionProperties { - fn from(f: &FileEncryptionProperties) -> Self { +impl From<&Arc> for ConfigFileEncryptionProperties { + fn from(f: &Arc) -> Self { let (column_names_vec, column_keys_vec, column_metas_vec) = f.column_keys(); let mut column_encryption_properties: HashMap< @@ -2557,13 +2558,13 @@ impl From for FileDecryptionProperties { fep = fep.with_aad_prefix(aad_prefix); } - fep.build().unwrap() + Arc::unwrap_or_clone(fep.build().unwrap()) } } #[cfg(feature = "parquet_encryption")] -impl From<&FileDecryptionProperties> for ConfigFileDecryptionProperties { - fn from(f: &FileDecryptionProperties) -> Self { +impl From<&Arc> for ConfigFileDecryptionProperties { + fn from(f: &Arc) -> Self { let (column_names_vec, column_keys_vec) = f.column_keys(); let mut column_decryption_properties: HashMap< String, @@ -2834,6 +2835,7 @@ mod tests { }; use std::any::Any; use std::collections::HashMap; + use std::sync::Arc; #[derive(Default, Debug, Clone)] pub struct TestExtensionConfig { @@ -2990,16 +2992,15 @@ mod tests { .unwrap(); // Test round-trip - let config_encrypt: ConfigFileEncryptionProperties = - (&file_encryption_properties).into(); - let encryption_properties_built: FileEncryptionProperties = - config_encrypt.clone().into(); + let config_encrypt = + ConfigFileEncryptionProperties::from(&file_encryption_properties); + let encryption_properties_built = + Arc::new(FileEncryptionProperties::from(config_encrypt.clone())); assert_eq!(file_encryption_properties, encryption_properties_built); - let config_decrypt: 
ConfigFileDecryptionProperties = - (&decryption_properties).into(); - let decryption_properties_built: FileDecryptionProperties = - config_decrypt.clone().into(); + let config_decrypt = ConfigFileDecryptionProperties::from(&decryption_properties); + let decryption_properties_built = + Arc::new(FileDecryptionProperties::from(config_decrypt.clone())); assert_eq!(decryption_properties, decryption_properties_built); /////////////////////////////////////////////////////////////////////////////////// diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 6866b4011f9e..34a36f543657 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -1417,7 +1417,7 @@ mod tests { fn from_qualified_schema_into_arrow_schema() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let arrow_schema = schema.as_arrow(); - insta::assert_snapshot!(arrow_schema, @r#"Field { name: "c0", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c1", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }"#); + insta::assert_snapshot!(arrow_schema.to_string(), @r#"Field { "c0": nullable Boolean }, Field { "c1": nullable Boolean }"#); Ok(()) } diff --git a/datafusion/common/src/encryption.rs b/datafusion/common/src/encryption.rs index b764ad77cff1..2a8cfdbc8996 100644 --- a/datafusion/common/src/encryption.rs +++ b/datafusion/common/src/encryption.rs @@ -24,38 +24,10 @@ pub use parquet::encryption::decrypt::FileDecryptionProperties; pub use parquet::encryption::encrypt::FileEncryptionProperties; #[cfg(not(feature = "parquet_encryption"))] -#[derive(Default, Debug)] +#[derive(Default, Clone, Debug)] pub struct FileDecryptionProperties; #[cfg(not(feature = "parquet_encryption"))] -#[derive(Default, Debug)] +#[derive(Default, Clone, Debug)] pub struct FileEncryptionProperties; pub use crate::config::{ConfigFileDecryptionProperties, ConfigFileEncryptionProperties}; - -#[cfg(feature = "parquet_encryption")] -pub fn map_encryption_to_config_encryption( - encryption: Option<&FileEncryptionProperties>, -) -> Option { - encryption.map(|fe| fe.into()) -} - -#[cfg(not(feature = "parquet_encryption"))] -pub fn map_encryption_to_config_encryption( - _encryption: Option<&FileEncryptionProperties>, -) -> Option { - None -} - -#[cfg(feature = "parquet_encryption")] -pub fn map_config_decryption_to_decryption( - decryption: &ConfigFileDecryptionProperties, -) -> FileDecryptionProperties { - decryption.clone().into() -} - -#[cfg(not(feature = "parquet_encryption"))] -pub fn map_config_decryption_to_decryption( - _decryption: &ConfigFileDecryptionProperties, -) -> FileDecryptionProperties { - FileDecryptionProperties {} -} diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 3977f2b489e1..564929c61bab 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -402,15 +402,14 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result PyResult { + fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { let array = self.to_array()?; // convert to pyarrow array using C data interface let pyarray = array.to_data().to_pyarrow(py)?; - let pyscalar = pyarray.call_method1(py, "__getitem__", (0,))?; + let pyscalar = pyarray.call_method1("__getitem__", (0,))?; Ok(pyscalar) } @@ -79,23 +79,22 @@ impl<'source> 
IntoPyObject<'source> for ScalarValue { let array = self.to_array()?; // convert to pyarrow array using C data interface let pyarray = array.to_data().to_pyarrow(py)?; - let pyarray_bound = pyarray.bind(py); - pyarray_bound.call_method1("__getitem__", (0,)) + pyarray.call_method1("__getitem__", (0,)) } } #[cfg(test)] mod tests { use pyo3::ffi::c_str; - use pyo3::prepare_freethreaded_python; use pyo3::py_run; use pyo3::types::PyDict; + use pyo3::Python; use super::*; fn init_python() { - prepare_freethreaded_python(); - Python::with_gil(|py| { + Python::initialize(); + Python::attach(|py| { if py.run(c_str!("import pyarrow"), None, None).is_err() { let locals = PyDict::new(py); py.run( @@ -135,12 +134,11 @@ mod tests { ScalarValue::Date32(Some(1234)), ]; - Python::with_gil(|py| { + Python::attach(|py| { for scalar in example_scalars.iter() { - let result = ScalarValue::from_pyarrow_bound( - scalar.to_pyarrow(py).unwrap().bind(py), - ) - .unwrap(); + let result = + ScalarValue::from_pyarrow_bound(&scalar.to_pyarrow(py).unwrap()) + .unwrap(); assert_eq!(scalar, &result); } }); @@ -150,7 +148,7 @@ mod tests { fn test_py_scalar() -> PyResult<()> { init_python(); - Python::with_gil(|py| -> PyResult<()> { + Python::attach(|py| -> PyResult<()> { let scalar_float = ScalarValue::Float64(Some(12.34)); let py_float = scalar_float .into_pyobject(py)? diff --git a/datafusion/core/benches/parquet_query_sql.rs b/datafusion/core/benches/parquet_query_sql.rs index 14dcdf15f173..e2b381048013 100644 --- a/datafusion/core/benches/parquet_query_sql.rs +++ b/datafusion/core/benches/parquet_query_sql.rs @@ -166,11 +166,12 @@ fn generate_file() -> NamedTempFile { } let metadata = writer.close().unwrap(); + let file_metadata = metadata.file_metadata(); assert_eq!( - metadata.num_rows as usize, + file_metadata.num_rows() as usize, WRITE_RECORD_BATCH_SIZE * NUM_BATCHES ); - assert_eq!(metadata.row_groups.len(), EXPECTED_ROW_GROUPS); + assert_eq!(metadata.row_groups().len(), EXPECTED_ROW_GROUPS); println!( "Generated parquet file in {} seconds", diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index d46a902ca513..930b4fad1d9b 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -116,6 +116,8 @@ mod tests { use datafusion_execution::config::SessionConfig; use datafusion_expr::{col, lit}; + #[cfg(feature = "parquet_encryption")] + use datafusion_common::config::ConfigFileEncryptionProperties; use object_store::local::LocalFileSystem; use parquet::file::reader::FileReader; use tempfile::TempDir; @@ -280,7 +282,8 @@ mod tests { // Write encrypted parquet using write_parquet let mut options = TableParquetOptions::default(); - options.crypto.file_encryption = Some((&encrypt).into()); + options.crypto.file_encryption = + Some(ConfigFileEncryptionProperties::from(&encrypt)); options.global.allow_single_file_parallelism = allow_single_file_parallelism; df.write_parquet( diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 088c4408fff5..1781ea569d90 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -154,7 +154,6 @@ mod tests { use futures::stream::BoxStream; use futures::StreamExt; use insta::assert_snapshot; - use log::error; use object_store::local::LocalFileSystem; use object_store::ObjectMeta; use object_store::{ @@ -163,9 +162,10 @@ mod tests { }; use 
parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::ParquetRecordBatchStreamBuilder; - use parquet::file::metadata::{KeyValue, ParquetColumnIndex, ParquetOffsetIndex}; - use parquet::file::page_index::index::Index; - use parquet::format::FileMetaData; + use parquet::file::metadata::{ + KeyValue, ParquetColumnIndex, ParquetMetaData, ParquetOffsetIndex, + }; + use parquet::file::page_index::column_index::ColumnIndexMetaData; use tokio::fs::File; enum ForceViews { @@ -1144,18 +1144,14 @@ mod tests { // 325 pages in int_col assert_eq!(int_col_offset.len(), 325); - match int_col_index { - Index::INT32(index) => { - assert_eq!(index.indexes.len(), 325); - for min_max in index.clone().indexes { - assert!(min_max.min.is_some()); - assert!(min_max.max.is_some()); - assert!(min_max.null_count.is_some()); - } - } - _ => { - error!("fail to read page index.") - } + let ColumnIndexMetaData::INT32(index) = int_col_index else { + panic!("fail to read page index.") + }; + assert_eq!(index.min_values().len(), 325); + assert_eq!(index.max_values().len(), 325); + // all values are non null + for idx in 0..325 { + assert_eq!(index.null_count(idx), Some(0)); } } @@ -1556,7 +1552,7 @@ mod tests { Ok(parquet_sink) } - fn get_written(parquet_sink: Arc) -> Result<(Path, FileMetaData)> { + fn get_written(parquet_sink: Arc) -> Result<(Path, ParquetMetaData)> { let mut written = parquet_sink.written(); let written = written.drain(); assert_eq!( @@ -1566,28 +1562,33 @@ mod tests { written.len() ); - let (path, file_metadata) = written.take(1).next().unwrap(); - Ok((path, file_metadata)) + let (path, parquet_meta_data) = written.take(1).next().unwrap(); + Ok((path, parquet_meta_data)) } - fn assert_file_metadata(file_metadata: FileMetaData, expected_kv: &Vec) { - let FileMetaData { - num_rows, - schema, - key_value_metadata, - .. - } = file_metadata; - assert_eq!(num_rows, 2, "file metadata to have 2 rows"); + fn assert_file_metadata( + parquet_meta_data: ParquetMetaData, + expected_kv: &Vec, + ) { + let file_metadata = parquet_meta_data.file_metadata(); + let schema_descr = file_metadata.schema_descr(); + assert_eq!(file_metadata.num_rows(), 2, "file metadata to have 2 rows"); assert!( - schema.iter().any(|col_schema| col_schema.name == "a"), + schema_descr + .columns() + .iter() + .any(|col_schema| col_schema.name() == "a"), "output file metadata should contain col a" ); assert!( - schema.iter().any(|col_schema| col_schema.name == "b"), + schema_descr + .columns() + .iter() + .any(|col_schema| col_schema.name() == "b"), "output file metadata should contain col b" ); - let mut key_value_metadata = key_value_metadata.unwrap(); + let mut key_value_metadata = file_metadata.key_value_metadata().unwrap().clone(); key_value_metadata.sort_by(|a, b| a.key.cmp(&b.key)); assert_eq!(&key_value_metadata, expected_kv); } @@ -1644,13 +1645,11 @@ mod tests { // check the file metadata includes partitions let mut expected_partitions = std::collections::HashSet::from(["a=foo", "a=bar"]); - for ( - path, - FileMetaData { - num_rows, schema, .. 
- }, - ) in written.take(2) - { + for (path, parquet_metadata) in written.take(2) { + let file_metadata = parquet_metadata.file_metadata(); + let schema = file_metadata.schema_descr(); + let num_rows = file_metadata.num_rows(); + let path_parts = path.parts().collect::>(); assert_eq!(path_parts.len(), 2, "should have path prefix"); @@ -1663,11 +1662,17 @@ mod tests { assert_eq!(num_rows, 1, "file metadata to have 1 row"); assert!( - !schema.iter().any(|col_schema| col_schema.name == "a"), + !schema + .columns() + .iter() + .any(|col_schema| col_schema.name() == "a"), "output file metadata will not contain partitioned col a" ); assert!( - schema.iter().any(|col_schema| col_schema.name == "b"), + schema + .columns() + .iter() + .any(|col_schema| col_schema.name() == "b"), "output file metadata should contain col b" ); } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 708c52001ee8..c280b50a9f07 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2644,7 +2644,7 @@ mod tests { // verify that the plan correctly casts u8 to i64 // the cast from u8 to i64 for literal will be simplified, and get lit(int64(5)) // the cast here is implicit so has CastOptions with safe=true - let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#; + let expected = r#"BinaryExpr { left: Column { name: "c7", index: 2 }, op: Lt, right: Literal { value: Int64(5), field: Field { name: "lit", data_type: Int64 } }, fail_on_overflow: false"#; assert_contains!(format!("{exec_plan:?}"), expected); Ok(()) @@ -2704,9 +2704,6 @@ mod tests { name: "lit", data_type: Utf8, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c1", @@ -2718,9 +2715,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c2", @@ -2732,9 +2726,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c3", @@ -2843,9 +2834,6 @@ mod tests { name: "lit", data_type: Utf8, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c1", @@ -2857,9 +2845,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c2", @@ -2871,9 +2856,6 @@ mod tests { name: "lit", data_type: Int64, nullable: true, - dict_id: 0, - dict_is_ordered: false, - metadata: {}, }, }, "c3", @@ -3047,7 +3029,7 @@ mod tests { .expect_err("planning error") .strip_backtrace(); - insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }"#); + insta::assert_snapshot!(e, @r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. 
LogicalPlan schema: DFSchema { inner: Schema { fields: [Field { name: "a", data_type: Int32 }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32 }], metadata: {} }"#); } #[tokio::test] @@ -3063,7 +3045,7 @@ mod tests { let execution_plan = plan(&logical_plan).await?; // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. - let expected = "exprs: [ProjectionExpr { expr: BinaryExpr { left: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"a\"), field: Field { name: \"lit\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }, op: Or, right: BinaryExpr { left: Column { name: \"c1\", index: 0 }, op: Eq, right: Literal { value: Utf8(\"1\"), field: Field { name: \"lit\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }, fail_on_overflow: false }"; + let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; assert_contains!(format!("{execution_plan:?}"), expected); @@ -3085,7 +3067,7 @@ mod tests { assert_contains!( &e, - r#"Error during planning: Can not find compatible types to compare Boolean with [Struct(foo Boolean), Utf8]"# + r#"Error during planning: Can not find compatible types to compare Boolean with [Struct("foo": Boolean), Utf8]"# ); Ok(()) diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index d95eb38c19e1..265862ff9af8 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -309,16 +309,16 @@ async fn test_fn_arrow_typeof() -> Result<()> { assert_snapshot!( batches_to_string(&batches), - @r#" - +------------------------------------------------------------------------------------------------------------------+ - | arrow_typeof(test.l) | - +------------------------------------------------------------------------------------------------------------------+ - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - | List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) | - +------------------------------------------------------------------------------------------------------------------+ - "#); + @r" + +----------------------+ + | arrow_typeof(test.l) | + +----------------------+ + | List(nullable Int32) | + | List(nullable Int32) | + | List(nullable Int32) | + | List(nullable Int32) | + +----------------------+ + "); Ok(()) } diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 979ada2bc6bb..17d1695478a5 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -2944,18 +2944,18 @@ async fn test_count_wildcard_on_window() -> Result<()> { assert_snapshot!( pretty_format_batches(&sql_results).unwrap(), 
@r#" - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | - | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | - | | TableScan: t1 projection=[a] | - | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | - | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | - | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a 
DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ "# ); @@ -2978,18 +2978,18 @@ async fn test_count_wildcard_on_window() -> Result<()> { assert_snapshot!( pretty_format_batches(&df_results).unwrap(), @r#" - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | plan_type | plan | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | - | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | - | | TableScan: t1 projection=[a] | - | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | - | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | - | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | - | | DataSourceExec: partitions=1, partition_sizes=[1] | - | | | - +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + 
+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ "# ); @@ -4435,12 +4435,12 @@ async fn unnest_with_redundant_columns() -> Result<()> { let actual = formatted.trim(); assert_snapshot!( actual, - @r###" + @r" Projection: shapes.shape_id [shape_id:UInt32] Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N] - Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { data_type: UInt32, nullable: true });N] TableScan: shapes projection=[shape_id] [shape_id:UInt32] - "### + " ); let results = df.collect().await?; diff --git a/datafusion/core/tests/parquet/encryption.rs b/datafusion/core/tests/parquet/encryption.rs index 819d8bf3a283..09b93f06ce85 100644 --- a/datafusion/core/tests/parquet/encryption.rs +++ b/datafusion/core/tests/parquet/encryption.rs @@ -314,7 +314,7 @@ async fn verify_file_encrypted( for col in row_group.columns() { assert!(matches!( col.crypto_metadata(), - Some(ColumnCryptoMetaData::EncryptionWithFooterKey) + Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) )); } } @@ -336,7 +336,7 @@ impl EncryptionFactory for MockEncryptionFactory { config: &EncryptionFactoryOptions, _schema: &SchemaRef, file_path: &object_store::path::Path, - ) -> datafusion_common::Result> { + ) -> datafusion_common::Result>> { assert_eq!( config.options.get("test_key"), Some(&"test value".to_string()) @@ -353,7 +353,7 @@ 
impl EncryptionFactory for MockEncryptionFactory { &self, config: &EncryptionFactoryOptions, file_path: &object_store::path::Path, - ) -> datafusion_common::Result> { + ) -> datafusion_common::Result>> { assert_eq!( config.options.get("test_key"), Some(&"test value".to_string()) diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index b769fec7d372..226497fe5824 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -631,8 +631,8 @@ async fn predicate_cache_pushdown_default() -> datafusion_common::Result<()> { #[tokio::test] async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { - // Can disable the cache even with filter pushdown by setting the size to 0. In this case we - // expect the inner records are reported but no records are read from the cache + // Can disable the cache even with filter pushdown by setting the size to 0. + // This results in no records read from the cache and no metrics reported let mut config = SessionConfig::new(); config.options_mut().execution.parquet.pushdown_filters = true; config @@ -641,13 +641,10 @@ async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> { .parquet .max_predicate_cache_size = Some(0); let ctx = SessionContext::new_with_config(config); + // Since the cache is disabled, there is no reporting or use of the cache PredicateCacheTest { - // file has 8 rows, which need to be read twice, one for filter, one for - // final output - expected_inner_records: 16, - // Expect this to 0 records read as the cache is disabled. However, it is - // non zero due to https://github.com/apache/arrow-rs/issues/8307 - expected_records: 3, + expected_inner_records: 0, + expected_records: 0, } .run(&ctx) .await diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index ad77a453350f..620259821871 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -667,12 +667,12 @@ async fn test_soft_hard_requirements_remove_soft_requirement() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -716,13 +716,13 @@ async fn 
test_soft_hard_requirements_remove_soft_requirement_without_pushdowns( assert_snapshot!(test.run(), @r#" Input Plan: ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as count] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -763,13 +763,13 @@ async fn test_soft_hard_requirements_remove_soft_requirement_without_pushdowns( let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -824,15 +824,15 @@ async fn test_soft_hard_requirements_multiple_soft_requirements() -> Result<()> let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -889,17 +889,17 @@ async fn test_soft_hard_requirements_multiple_soft_requirements() -> Result<()> let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], 
preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -961,14 +961,14 @@ async fn test_soft_hard_requirements_multiple_sorts() -> Result<()> { Input Plan: SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] ProjectionExec: expr=[nullable_col@0 + non_nullable_col@1 as nullable_col] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1023,16 +1023,16 @@ async fn test_soft_hard_requirements_with_multiple_soft_requirements_and_output_ assert_snapshot!(test.run(), @r#" Input Plan: OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet Optimized Plan: OutputRequirementExec: order_by=[(non_nullable_col@1, asc)], dist_by=SinglePartition - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: 
{} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], file_type=parquet "#); @@ -1081,7 +1081,7 @@ async fn test_window_multi_path_sort() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 DESC NULLS LAST] UnionExec SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] @@ -1090,7 +1090,7 @@ async fn test_window_multi_path_sort() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] SortPreservingMergeExec: [nullable_col@0 ASC] UnionExec DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC], file_type=parquet @@ -1122,7 +1122,7 @@ async fn test_window_multi_path_sort2() -> Result<()> { let test = EnforceSortingTest::new(physical_plan).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 ASC, non_nullable_col@1 ASC] UnionExec SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] @@ -1131,7 +1131,7 @@ async fn test_window_multi_path_sort2() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [nullable_col@0 ASC] UnionExec DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC], file_type=parquet 
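The snapshot churn in these enforce_sorting tests largely reflects arrow's trimmed-down `Field` Debug output: attributes that hold their default values (nullable=false, the dictionary flags, empty metadata) are no longer printed. A minimal sketch of the old versus new rendering, assuming a recent arrow crate and the usual `Field::new(name, data_type, nullable)` constructor; the exact Debug string depends on the arrow version in use, so treat the comments below as illustrative rather than normative:

use arrow::datatypes::{DataType, Field};

fn main() {
    // Non-nullable Int64 field, matching the "count" window aggregate in the plans above.
    let count = Field::new("count", DataType::Int64, false);

    // Older arrow releases printed every attribute, e.g.
    //   Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }
    // The updated snapshots drop the defaulted attributes; depending on how the
    // operator formats its fields they appear either as
    //   Field { name: "count", data_type: Int64 }
    // or in the fully compact form
    //   Field { "count": Int64 }
    println!("{count:?}");
}
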
@@ -1678,7 +1678,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC, b@1 ASC RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 @@ -1686,7 +1686,7 @@ async fn test_window_multi_layer_requirement() -> Result<()> { DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortPreservingMergeExec: [a@0 ASC, b@1 ASC] SortExec: expr=[a@0 ASC, b@1 ASC], preserve_partitioning=[true] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 @@ -1783,18 +1783,18 @@ async fn test_remove_unnecessary_sort_window_multilayer() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] FilterExec: NOT non_nullable_col@1 SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] CoalesceBatchesExec: target_batch_size=128 SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] FilterExec: NOT non_nullable_col@1 - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] CoalesceBatchesExec: target_batch_size=128 SortExec: expr=[non_nullable_col@1 DESC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] @@ -2238,17 +2238,17 @@ async fn test_multiple_sort_window_exec() -> Result<()> { EnforceSortingTest::new(physical_plan.clone()).with_repartition_sorts(true); assert_snapshot!(test.run(), @r#" Input Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] Optimized Plan: - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC, non_nullable_col@1 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] "#); @@ -2273,7 +2273,7 @@ async fn test_commutativity() -> Result<()> { assert_snapshot!(displayable(orig_plan.as_ref()).indent(true), @r#" SortExec: expr=[nullable_col@0 ASC], preserve_partitioning=[false] RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, 
partition_sizes=[0] "#); @@ -2483,7 +2483,6 @@ async fn test_not_replaced_with_partial_sort_for_unbounded_input() -> Result<()> Ok(()) } -// Test that verifies that an orthogonal sort (a sort on columns not in the input ordering) #[test] fn test_removes_unused_orthogonal_sort() -> Result<()> { let schema = create_test_schema3()?; diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs index 7d6c0484b624..ef233e222912 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting_monotonicity.rs @@ -229,11 +229,11 @@ fn test_window_partial_constant_and_set_monotonicity_0() { @ r#" Input Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -253,11 +253,11 @@ fn test_window_partial_constant_and_set_monotonicity_1() { @ r#" Input Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet Optimized Plan: - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 
group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -275,15 +275,15 @@ fn test_window_partial_constant_and_set_monotonicity_2() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -299,15 +299,15 @@ fn test_window_partial_constant_and_set_monotonicity_3() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -323,16 +323,16 @@ fn test_window_partial_constant_and_set_monotonicity_4() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -348,16 +348,16 @@ fn test_window_partial_constant_and_set_monotonicity_5() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[non_nullable_col@1 DESC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), 
frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -373,16 +373,16 @@ fn test_window_partial_constant_and_set_monotonicity_6() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 ASC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -398,16 +398,16 @@ fn test_window_partial_constant_and_set_monotonicity_7() { ], 
}.run(), @ r#" -Input Plan: -SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + SortExec: expr=[nullable_col@0 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -427,15 +427,15 @@ fn test_window_partial_constant_and_set_monotonicity_8() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, 
projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -451,15 +451,15 @@ fn test_window_partial_constant_and_set_monotonicity_9() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -477,7 +477,7 @@ fn test_window_partial_constant_and_set_monotonicity_10() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -497,7 +497,7 @@ fn test_window_partial_constant_and_set_monotonicity_11() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], 
preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -522,7 +522,7 @@ fn test_window_partial_constant_and_set_monotonicity_12() { @ r#" Input / Optimized Plan: SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -543,7 +543,7 @@ fn test_window_partial_constant_and_set_monotonicity_13() { @ r#" Input / Optimized Plan: SortExec: expr=[non_nullable_col@1 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -564,7 +564,7 @@ fn test_window_partial_constant_and_set_monotonicity_14() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, non_nullable_col@1 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -585,7 +585,7 @@ fn test_window_partial_constant_and_set_monotonicity_15() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -608,15 +608,15 @@ fn test_window_partial_constant_and_set_monotonicity_16() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -633,15 +633,15 @@ fn test_window_partial_constant_and_set_monotonicity_17() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[max@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], 
file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -658,15 +658,15 @@ fn test_window_partial_constant_and_set_monotonicity_18() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 ASC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -685,7 +685,7 @@ fn test_window_partial_constant_and_set_monotonicity_19() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS 
LAST], file_type=parquet "# ); @@ -710,7 +710,7 @@ fn test_window_partial_constant_and_set_monotonicity_20() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -729,15 +729,15 @@ fn test_window_partial_constant_and_set_monotonicity_21() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -756,7 +756,7 @@ fn test_window_partial_constant_and_set_monotonicity_22() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 
ASC NULLS LAST], file_type=parquet "# ); @@ -777,7 +777,7 @@ fn test_window_partial_constant_and_set_monotonicity_23() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -800,15 +800,15 @@ fn test_window_partial_constant_and_set_monotonicity_24() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 DESC NULLS LAST], preserve_partitioning=[false] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -827,7 +827,7 @@ fn test_window_partial_constant_and_set_monotonicity_25() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -847,7 +847,7 @@ fn test_window_partial_constant_and_set_monotonicity_26() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -867,7 +867,7 @@ fn test_window_partial_constant_and_set_monotonicity_27() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -893,7 +893,7 @@ fn test_window_partial_constant_and_set_monotonicity_28() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[count: Ok(Field { name: "count", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -912,15 +912,15 @@ fn test_window_partial_constant_and_set_monotonicity_29() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] - WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC], preserve_partitioning=[false] + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"#) + Optimized Plan: + WindowAggExec: wdw=[max: Ok(Field { name: "max", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "#) } // Case 30: @@ -937,7 +937,7 @@ fn test_window_partial_constant_and_set_monotonicity_30() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[min: Ok(Field { name: "min", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "#); } @@ -957,7 +957,7 @@ fn test_window_partial_constant_and_set_monotonicity_31() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] + WindowAggExec: wdw=[avg: Ok(Field { name: "avg", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(NULL)), is_causal: false }] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -981,15 +981,15 @@ fn test_window_partial_constant_and_set_monotonicity_32() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized 
Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1008,7 +1008,7 @@ fn test_window_partial_constant_and_set_monotonicity_33() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1027,15 +1027,15 @@ fn test_window_partial_constant_and_set_monotonicity_34() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } // Case 35: @@ -1053,7 +1053,7 @@ fn test_window_partial_constant_and_set_monotonicity_35() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1077,15 +1077,15 @@ fn test_window_partial_constant_and_set_monotonicity_36() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1102,15 +1102,15 @@ fn test_window_partial_constant_and_set_monotonicity_37() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: 
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1129,7 +1129,7 @@ fn test_window_partial_constant_and_set_monotonicity_38() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1149,7 +1149,7 @@ fn test_window_partial_constant_and_set_monotonicity_39() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1173,15 +1173,15 @@ fn test_window_partial_constant_and_set_monotonicity_40() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1200,7 +1200,7 @@ fn test_window_partial_constant_and_set_monotonicity_41() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 ASC NULLS LAST, nullable_col@0 ASC 
NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1221,7 +1221,7 @@ fn test_window_partial_constant_and_set_monotonicity_42() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1242,7 +1242,7 @@ fn test_window_partial_constant_and_set_monotonicity_43() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1267,7 +1267,7 @@ fn test_window_partial_constant_and_set_monotonicity_44() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1288,7 +1288,7 @@ fn test_window_partial_constant_and_set_monotonicity_45() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1307,15 +1307,15 @@ fn test_window_partial_constant_and_set_monotonicity_46() { ], }.run(), @ r#" -Input Plan: 
-SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1331,15 +1331,15 @@ fn test_window_partial_constant_and_set_monotonicity_47() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1361,15 +1361,15 @@ fn test_window_partial_constant_and_set_monotonicity_48() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: 
Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1387,7 +1387,7 @@ fn test_window_partial_constant_and_set_monotonicity_49() { @ r#" Input / Optimized Plan: SortExec: expr=[max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1406,15 +1406,15 @@ fn test_window_partial_constant_and_set_monotonicity_50() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[min: 
Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1432,7 +1432,7 @@ fn test_window_partial_constant_and_set_monotonicity_51() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1458,7 +1458,7 @@ fn test_window_partial_constant_and_set_monotonicity_52() { @ r#" Input / Optimized Plan: SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1479,7 +1479,7 @@ fn test_window_partial_constant_and_set_monotonicity_53() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1499,7 +1499,7 @@ fn test_window_partial_constant_and_set_monotonicity_54() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1517,15 +1517,15 @@ fn test_window_partial_constant_and_set_monotonicity_55() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], 
output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1547,15 +1547,15 @@ fn test_window_partial_constant_and_set_monotonicity_56() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[count@2 ASC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } @@ -1574,7 +1574,7 @@ fn test_window_partial_constant_and_set_monotonicity_57() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], 
file_type=parquet "# ); @@ -1595,7 +1595,7 @@ fn test_window_partial_constant_and_set_monotonicity_58() { @ r#" Input / Optimized Plan: SortExec: expr=[min@2 DESC NULLS LAST, nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1615,7 +1615,7 @@ fn test_window_partial_constant_and_set_monotonicity_59() { @ r#" Input / Optimized Plan: SortExec: expr=[avg@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1641,7 +1641,7 @@ fn test_window_partial_constant_and_set_monotonicity_60() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, count@2 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1662,7 +1662,7 @@ fn test_window_partial_constant_and_set_monotonicity_61() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, max@2 ASC], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[max: Field { name: "max", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[max: Field { "max": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ -1683,7 +1683,7 @@ fn test_window_partial_constant_and_set_monotonicity_62() { @ r#" Input / Optimized Plan: SortExec: expr=[nullable_col@0 ASC NULLS LAST, min@2 DESC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[min: Field { name: "min", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[min: Field { "min": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet "# ); @@ 
-1701,15 +1701,15 @@ fn test_window_partial_constant_and_set_monotonicity_63() { ], }.run(), @ r#" -Input Plan: -SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] - BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + Input Plan: + SortExec: expr=[nullable_col@0 ASC NULLS LAST], preserve_partitioning=[false] + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -Optimized Plan: -BoundedWindowAggExec: wdw=[avg: Field { name: "avg", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet -"# + Optimized Plan: + BoundedWindowAggExec: wdw=[avg: Field { "avg": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC NULLS LAST], file_type=parquet + "# ); } // =============================================REGION ENDS============================================= diff --git a/datafusion/core/tests/physical_optimizer/sanity_checker.rs b/datafusion/core/tests/physical_optimizer/sanity_checker.rs index ce6eb13c86c4..9867ed173341 100644 --- a/datafusion/core/tests/physical_optimizer/sanity_checker.rs +++ b/datafusion/core/tests/physical_optimizer/sanity_checker.rs @@ -421,7 +421,7 @@ async fn test_bounded_window_agg_sort_requirement() -> Result<()> { assert_snapshot!( actual, @r#" - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] DataSourceExec: partitions=1, partition_sizes=[0] "# @@ -449,7 +449,7 @@ async fn test_bounded_window_agg_no_sort_requirement() -> Result<()> { assert_snapshot!( actual, @r#" - BoundedWindowAggExec: wdw=[count: Field { name: "count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[count: Field { "count": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[0] "# ); diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 2eb3ba36dd90..8a0f62062738 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -222,11 +222,11 @@ async fn test_parameter_invalid_types() -> Result<()> { .collect() .await; assert_snapshot!(results.unwrap_err().strip_backtrace(), - 
@r#" - type_coercion - caused by - Error during planning: Cannot infer common argument type for comparison operation List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) = Int32 - "#); + @r" + type_coercion + caused by + Error during planning: Cannot infer common argument type for comparison operation List(nullable Int32) = Int32 + "); Ok(()) } diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 963c1d77950c..f27bda387fda 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -38,8 +38,6 @@ use datafusion_datasource::write::demux::DemuxedStreamReceiver; use arrow::datatypes::{DataType, Field, FieldRef}; use datafusion_common::config::{ConfigField, ConfigFileType, TableParquetOptions}; -#[cfg(feature = "parquet_encryption")] -use datafusion_common::encryption::map_config_decryption_to_decryption; use datafusion_common::encryption::FileDecryptionProperties; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ @@ -59,11 +57,13 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use datafusion_session::Session; +use crate::metadata::DFParquetMetadata; use crate::reader::CachedParquetFileReaderFactory; use crate::source::{parse_coerce_int96_string, ParquetSource}; use async_trait::async_trait; use bytes::Bytes; use datafusion_datasource::source::DataSourceExec; +use datafusion_execution::cache::cache_manager::FileMetadataCache; use datafusion_execution::runtime_env::RuntimeEnv; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt, TryStreamExt}; @@ -77,14 +77,12 @@ use parquet::arrow::arrow_writer::{ use parquet::arrow::async_reader::MetadataFetch; use parquet::arrow::{ArrowWriter, AsyncArrowWriter}; use parquet::basic::Type; - -use crate::metadata::DFParquetMetadata; -use datafusion_execution::cache::cache_manager::FileMetadataCache; +#[cfg(feature = "parquet_encryption")] +use parquet::encryption::encrypt::FileEncryptionProperties; use parquet::errors::ParquetError; use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder}; use parquet::file::writer::SerializedFileWriter; -use parquet::format::FileMetaData; use parquet::schema::types::SchemaDescriptor; use tokio::io::{AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc::{self, Receiver, Sender}; @@ -306,25 +304,23 @@ async fn get_file_decryption_properties( state: &dyn Session, options: &TableParquetOptions, file_path: &Path, -) -> Result> { - let file_decryption_properties: Option = - match &options.crypto.file_decryption { - Some(cfd) => Some(map_config_decryption_to_decryption(cfd)), - None => match &options.crypto.factory_id { - Some(factory_id) => { - let factory = - state.runtime_env().parquet_encryption_factory(factory_id)?; - factory - .get_file_decryption_properties( - &options.crypto.factory_options, - file_path, - ) - .await? 
- } - None => None, - }, - }; - Ok(file_decryption_properties) +) -> Result>> { + Ok(match &options.crypto.file_decryption { + Some(cfd) => Some(Arc::new(FileDecryptionProperties::from(cfd.clone()))), + None => match &options.crypto.factory_id { + Some(factory_id) => { + let factory = + state.runtime_env().parquet_encryption_factory(factory_id)?; + factory + .get_file_decryption_properties( + &options.crypto.factory_options, + file_path, + ) + .await? + } + None => None, + }, + }) } #[cfg(not(feature = "parquet_encryption"))] @@ -332,7 +328,7 @@ async fn get_file_decryption_properties( _state: &dyn Session, _options: &TableParquetOptions, _file_path: &Path, -) -> Result> { +) -> Result>> { Ok(None) } @@ -385,7 +381,7 @@ impl FileFormat for ParquetFormat { .await?; let result = DFParquetMetadata::new(store.as_ref(), object) .with_metadata_size_hint(self.metadata_size_hint()) - .with_decryption_properties(file_decryption_properties.as_ref()) + .with_decryption_properties(file_decryption_properties) .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache))) .with_coerce_int96(coerce_int96) .fetch_schema_with_location() @@ -446,7 +442,7 @@ impl FileFormat for ParquetFormat { state.runtime_env().cache_manager.get_file_metadata_cache(); DFParquetMetadata::new(store, object) .with_metadata_size_hint(self.metadata_size_hint()) - .with_decryption_properties(file_decryption_properties.as_ref()) + .with_decryption_properties(file_decryption_properties) .with_file_metadata_cache(Some(file_metadata_cache)) .fetch_statistics(&table_schema) .await @@ -1027,9 +1023,10 @@ pub async fn fetch_parquet_metadata( store: &dyn ObjectStore, object_meta: &ObjectMeta, size_hint: Option, - #[allow(unused)] decryption_properties: Option<&FileDecryptionProperties>, + decryption_properties: Option<&FileDecryptionProperties>, file_metadata_cache: Option>, ) -> Result> { + let decryption_properties = decryption_properties.cloned().map(Arc::new); DFParquetMetadata::new(store, object_meta) .with_metadata_size_hint(size_hint) .with_decryption_properties(decryption_properties) @@ -1053,6 +1050,7 @@ pub async fn fetch_statistics( decryption_properties: Option<&FileDecryptionProperties>, file_metadata_cache: Option>, ) -> Result { + let decryption_properties = decryption_properties.cloned().map(Arc::new); DFParquetMetadata::new(store, file) .with_metadata_size_hint(metadata_size_hint) .with_decryption_properties(decryption_properties) @@ -1080,7 +1078,7 @@ pub struct ParquetSink { parquet_options: TableParquetOptions, /// File metadata from successfully produced parquet files. The Mutex is only used /// to allow inserting to HashMap from behind borrowed reference in DataSink::write_all. - written: Arc>>, + written: Arc>>, } impl Debug for ParquetSink { @@ -1117,7 +1115,7 @@ impl ParquetSink { /// Retrieve the file metadata for the written files, keyed to the path /// which may be partitioned (in the case of hive style partitioning). 
- pub fn written(&self) -> HashMap { + pub fn written(&self) -> HashMap { self.written.lock().clone() } @@ -1141,7 +1139,7 @@ impl ParquetSink { builder = set_writer_encryption_properties( builder, runtime, - &parquet_opts, + parquet_opts, schema, path, ) @@ -1189,14 +1187,15 @@ impl ParquetSink { async fn set_writer_encryption_properties( builder: WriterPropertiesBuilder, runtime: &Arc, - parquet_opts: &TableParquetOptions, + parquet_opts: TableParquetOptions, schema: &Arc, path: &Path, ) -> Result { - if let Some(file_encryption_properties) = &parquet_opts.crypto.file_encryption { + if let Some(file_encryption_properties) = parquet_opts.crypto.file_encryption { // Encryption properties have been specified directly - return Ok(builder - .with_file_encryption_properties(file_encryption_properties.clone().into())); + return Ok(builder.with_file_encryption_properties(Arc::new( + FileEncryptionProperties::from(file_encryption_properties), + ))); } else if let Some(encryption_factory_id) = &parquet_opts.crypto.factory_id.as_ref() { // Encryption properties will be generated by an encryption factory let encryption_factory = @@ -1221,7 +1220,7 @@ async fn set_writer_encryption_properties( async fn set_writer_encryption_properties( builder: WriterPropertiesBuilder, _runtime: &Arc, - _parquet_opts: &TableParquetOptions, + _parquet_opts: TableParquetOptions, _schema: &Arc, _path: &Path, ) -> Result { @@ -1244,7 +1243,7 @@ impl FileSink for ParquetSink { let parquet_opts = &self.parquet_options; let mut file_write_tasks: JoinSet< - std::result::Result<(Path, FileMetaData), DataFusionError>, + std::result::Result<(Path, ParquetMetaData), DataFusionError>, > = JoinSet::new(); let runtime = context.runtime_env(); @@ -1275,11 +1274,11 @@ impl FileSink for ParquetSink { writer.write(&batch).await?; reservation.try_resize(writer.memory_size())?; } - let file_metadata = writer + let parquet_meta_data = writer .close() .await .map_err(|e| DataFusionError::ParquetError(Box::new(e)))?; - Ok((path, file_metadata)) + Ok((path, parquet_meta_data)) }); } else { let writer = ObjectWriterBuilder::new( @@ -1303,7 +1302,7 @@ impl FileSink for ParquetSink { let parallel_options_clone = parallel_options.clone(); let pool = Arc::clone(context.memory_pool()); file_write_tasks.spawn(async move { - let file_metadata = output_single_parquet_file_parallelized( + let parquet_meta_data = output_single_parquet_file_parallelized( writer, rx, schema, @@ -1313,7 +1312,7 @@ impl FileSink for ParquetSink { pool, ) .await?; - Ok((path, file_metadata)) + Ok((path, parquet_meta_data)) }); } } @@ -1322,11 +1321,11 @@ impl FileSink for ParquetSink { while let Some(result) = file_write_tasks.join_next().await { match result { Ok(r) => { - let (path, file_metadata) = r?; - row_count += file_metadata.num_rows; + let (path, parquet_meta_data) = r?; + row_count += parquet_meta_data.file_metadata().num_rows(); let mut written_files = self.written.lock(); written_files - .try_insert(path.clone(), file_metadata) + .try_insert(path.clone(), parquet_meta_data) .map_err(|e| internal_datafusion_err!("duplicate entry detected for partitioned file {path}: {e}"))?; drop(written_files); } @@ -1589,7 +1588,7 @@ async fn concatenate_parallel_row_groups( mut serialize_rx: Receiver>, mut object_store_writer: Box, pool: Arc, -) -> Result { +) -> Result { let mut file_reservation = MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool); @@ -1617,14 +1616,14 @@ async fn concatenate_parallel_row_groups( rg_out.close()?; } - let file_metadata = 
parquet_writer.close()?; + let parquet_meta_data = parquet_writer.close()?; let final_buff = merged_buff.buffer.try_lock().unwrap(); object_store_writer.write_all(final_buff.as_slice()).await?; object_store_writer.shutdown().await?; file_reservation.free(); - Ok(file_metadata) + Ok(parquet_meta_data) } /// Parallelizes the serialization of a single parquet file, by first serializing N @@ -1639,7 +1638,7 @@ async fn output_single_parquet_file_parallelized( skip_arrow_metadata: bool, parallel_options: ParallelParquetWriterOptions, pool: Arc, -) -> Result { +) -> Result { let max_rowgroups = parallel_options.max_parallel_row_groups; // Buffer size of this channel limits maximum number of RowGroups being worked on in parallel let (serialize_tx, serialize_rx) = @@ -1666,7 +1665,7 @@ async fn output_single_parquet_file_parallelized( parallel_options, Arc::clone(&pool), ); - let file_metadata = concatenate_parallel_row_groups( + let parquet_meta_data = concatenate_parallel_row_groups( writer, merged_buff, serialize_rx, @@ -1679,7 +1678,7 @@ async fn output_single_parquet_file_parallelized( .join_unwind() .await .map_err(|e| DataFusionError::ExecutionJoin(Box::new(e)))??; - Ok(file_metadata) + Ok(parquet_meta_data) } #[cfg(test)] diff --git a/datafusion/datasource-parquet/src/metadata.rs b/datafusion/datasource-parquet/src/metadata.rs index c8ee4d3b9f57..6505a447d7ce 100644 --- a/datafusion/datasource-parquet/src/metadata.rs +++ b/datafusion/datasource-parquet/src/metadata.rs @@ -58,7 +58,7 @@ pub struct DFParquetMetadata<'a> { store: &'a dyn ObjectStore, object_meta: &'a ObjectMeta, metadata_size_hint: Option, - decryption_properties: Option<&'a FileDecryptionProperties>, + decryption_properties: Option>, file_metadata_cache: Option>, /// timeunit to coerce INT96 timestamps to pub coerce_int96: Option, @@ -85,7 +85,7 @@ impl<'a> DFParquetMetadata<'a> { /// set decryption properties pub fn with_decryption_properties( mut self, - decryption_properties: Option<&'a FileDecryptionProperties>, + decryption_properties: Option>, ) -> Self { self.decryption_properties = decryption_properties; self @@ -145,7 +145,8 @@ impl<'a> DFParquetMetadata<'a> { #[cfg(feature = "parquet_encryption")] if let Some(decryption_properties) = decryption_properties { - reader = reader.with_decryption_properties(Some(decryption_properties)); + reader = reader + .with_decryption_properties(Some(Arc::clone(decryption_properties))); } if cache_metadata && file_metadata_cache.is_some() { diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 167fc3c5147e..af7a537ca6f4 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -208,7 +208,7 @@ impl FileOpener for ParquetOpener { let mut options = ArrowReaderOptions::new().with_page_index(false); #[cfg(feature = "parquet_encryption")] if let Some(fd_val) = file_decryption_properties { - options = options.with_file_decryption_properties((*fd_val).clone()); + options = options.with_file_decryption_properties(Arc::clone(&fd_val)); } let mut metadata_timer = file_metrics.metadata_load_time.timer(); @@ -581,8 +581,7 @@ impl EncryptionContext { None => match &self.encryption_factory { Some((encryption_factory, encryption_config)) => Ok(encryption_factory .get_file_decryption_properties(encryption_config, file_location) - .await? 
- .map(Arc::new)), + .await?), None => Ok(None), }, } diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs index 5f3e05747d40..65d1affb44a9 100644 --- a/datafusion/datasource-parquet/src/page_filter.rs +++ b/datafusion/datasource-parquet/src/page_filter.rs @@ -36,7 +36,7 @@ use datafusion_pruning::PruningPredicate; use log::{debug, trace}; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; use parquet::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex}; -use parquet::format::PageLocation; +use parquet::file::page_index::offset_index::PageLocation; use parquet::schema::types::SchemaDescriptor; use parquet::{ arrow::arrow_reader::{RowSelection, RowSelector}, diff --git a/datafusion/datasource-parquet/src/reader.rs b/datafusion/datasource-parquet/src/reader.rs index 687a7f15fccc..88a3cea5623b 100644 --- a/datafusion/datasource-parquet/src/reader.rs +++ b/datafusion/datasource-parquet/src/reader.rs @@ -262,8 +262,9 @@ impl AsyncFileReader for CachedParquetFileReader { async move { #[cfg(feature = "parquet_encryption")] - let file_decryption_properties = - options.and_then(|o| o.file_decryption_properties()); + let file_decryption_properties = options + .and_then(|o| o.file_decryption_properties()) + .map(Arc::clone); #[cfg(not(feature = "parquet_encryption"))] let file_decryption_properties = None; diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 20d71692926f..186d922fc373 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -52,12 +52,12 @@ use datafusion_physical_plan::metrics::Count; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::DisplayFormatType; -#[cfg(feature = "parquet_encryption")] -use datafusion_common::encryption::map_config_decryption_to_decryption; #[cfg(feature = "parquet_encryption")] use datafusion_execution::parquet_encryption::EncryptionFactory; use itertools::Itertools; use object_store::ObjectStore; +#[cfg(feature = "parquet_encryption")] +use parquet::encryption::decrypt::FileDecryptionProperties; /// Execution plan for reading one or more Parquet files. /// @@ -547,8 +547,8 @@ impl FileSource for ParquetSource { .table_parquet_options() .crypto .file_decryption - .as_ref() - .map(map_config_decryption_to_decryption) + .clone() + .map(FileDecryptionProperties::from) .map(Arc::new); let coerce_int96 = self diff --git a/datafusion/execution/src/parquet_encryption.rs b/datafusion/execution/src/parquet_encryption.rs index 73881e11ca72..027421e08f54 100644 --- a/datafusion/execution/src/parquet_encryption.rs +++ b/datafusion/execution/src/parquet_encryption.rs @@ -41,14 +41,14 @@ pub trait EncryptionFactory: Send + Sync + std::fmt::Debug + 'static { config: &EncryptionFactoryOptions, schema: &SchemaRef, file_path: &Path, - ) -> Result>; + ) -> Result>>; /// Generate file decryption properties to use when reading a Parquet file. 
async fn get_file_decryption_properties( &self, config: &EncryptionFactoryOptions, file_path: &Path, - ) -> Result>; + ) -> Result>>; } /// Stores [`EncryptionFactory`] implementations that can be retrieved by a unique string identifier diff --git a/datafusion/functions-aggregate-common/src/utils.rs b/datafusion/functions-aggregate-common/src/utils.rs index b01f2c8629c9..7ce5f09373f5 100644 --- a/datafusion/functions-aggregate-common/src/utils.rs +++ b/datafusion/functions-aggregate-common/src/utils.rs @@ -95,6 +95,8 @@ pub struct DecimalAverager { target_mul: T::Native, /// the output precision target_precision: u8, + /// the output scale + target_scale: i8, } impl DecimalAverager { @@ -129,6 +131,7 @@ impl DecimalAverager { sum_mul, target_mul, target_precision, + target_scale, }) } else { // can't convert the lit decimal to the returned data type @@ -147,8 +150,11 @@ impl DecimalAverager { if let Ok(value) = sum.mul_checked(self.target_mul.div_wrapping(self.sum_mul)) { let new_value = value.div_wrapping(count); - let validate = - T::validate_decimal_precision(new_value, self.target_precision); + let validate = T::validate_decimal_precision( + new_value, + self.target_precision, + self.target_scale, + ); if validate.is_ok() { Ok(new_value) diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index 94a41ba4bb25..c4e58601cd10 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -60,16 +60,26 @@ use datafusion_macros::user_doc; description = "Casts a value to a specific Arrow data type.", syntax_example = "arrow_cast(expression, datatype)", sql_example = r#"```sql -> select arrow_cast(-5, 'Int8') as a, +> select + arrow_cast(-5, 'Int8') as a, arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, - arrow_cast('bar', 'LargeUtf8') as c, - arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d - ; -+----+-----+-----+---------------------------+ -| a | b | c | d | -+----+-----+-----+---------------------------+ -| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | -+----+-----+-----+---------------------------+ + arrow_cast('bar', 'LargeUtf8') as c; + ++----+-----+-----+ +| a | b | c | ++----+-----+-----+ +| -5 | foo | bar | ++----+-----+-----+ + +> select + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs, "+08:00")') as d, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs)') as e; + ++---------------------------+---------------------+ +| d | e | ++---------------------------+---------------------+ +| 2023-01-02T12:53:02+08:00 | 2023-01-02T12:53:02 | ++---------------------------+---------------------+ ```"#, argument( name = "expression", diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index 74e286de0f58..c4e89743bd55 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -687,7 +687,7 @@ mod tests { let res = invoke_date_bin_with_args(args, 1, return_field); assert_eq!( res.err().unwrap().strip_backtrace(), - "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got Timestamp(Microsecond, None)" + "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP with nanosecond precision but got Timestamp(µs)" ); args = vec![ diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 3d5dee3a7255..4fb0f8553b4b 100644 --- 
a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -2117,7 +2117,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) = CAST(CAST(Utf8("1998-03-18") AS Date32) AS Timestamp(Nanosecond, None)) + Projection: CAST(Utf8("1998-03-18") AS Timestamp(ns)) = CAST(CAST(Utf8("1998-03-18") AS Date32) AS Timestamp(ns)) EmptyRelation: rows=0 "# ) @@ -2258,7 +2258,7 @@ mod test { let err = coerce_case_expression(case, &schema).unwrap_err(); assert_snapshot!( err.strip_backtrace(), - @"Error during planning: Failed to coerce then (Date32, Float32, Binary) and else (Timestamp(Nanosecond, None)) to common types in CASE WHEN expression" + @"Error during planning: Failed to coerce then (Date32, Float32, Binary) and else (Timestamp(ns)) to common types in CASE WHEN expression" ); Ok(()) @@ -2465,7 +2465,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: a = CAST(CAST(a AS Map(Field { name: "key_value", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false)) AS Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false)) + Projection: a = CAST(CAST(a AS Map("key_value": Struct("key": Utf8, "value": nullable Float64), unsorted)) AS Map("entries": Struct("key": Utf8, "value": nullable Float64), unsorted)) EmptyRelation: rows=0 "# ) @@ -2488,7 +2488,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: IntervalYearMonth("12") + CAST(Utf8("2000-01-01T00:00:00") AS Timestamp(Nanosecond, None)) + Projection: IntervalYearMonth("12") + CAST(Utf8("2000-01-01T00:00:00") AS Timestamp(ns)) EmptyRelation: rows=0 "# ) @@ -2513,7 +2513,7 @@ mod test { assert_analyzed_plan_eq!( plan, @r#" - Projection: CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) - CAST(Utf8("1998-03-18") AS Timestamp(Nanosecond, None)) + Projection: CAST(Utf8("1998-03-18") AS Timestamp(ns)) - CAST(Utf8("1998-03-18") AS Timestamp(ns)) EmptyRelation: rows=0 "# ) diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index c8be689fc5a4..ccf90f91e68f 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -1972,14 +1972,14 @@ mod tests { assert_optimized_plan_equal!( plan, - @r#" + @r" Projection: test.b [b:UInt32] LeftSemi Join: Filter: Boolean(true) [a:UInt32, b:UInt32, c:UInt32] TableScan: test [a:UInt32, b:UInt32, c:UInt32] SubqueryAlias: __correlated_sq_1 [arr:Int32;N] Unnest: lists[sq.arr|depth=1] structs[] [arr:Int32;N] - TableScan: sq [arr:List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - "# + TableScan: sq [arr:List(Field { data_type: Int32, nullable: true });N] + " ) } @@ -2007,14 +2007,14 @@ mod tests { assert_optimized_plan_equal!( plan, - @r#" + @r" Projection: test.b [b:UInt32] LeftSemi Join: Filter: __correlated_sq_1.a = 
test.b [a:UInt32, b:UInt32, c:UInt32] TableScan: test [a:UInt32, b:UInt32, c:UInt32] SubqueryAlias: __correlated_sq_1 [a:UInt32;N] Unnest: lists[sq.a|depth=1] structs[] [a:UInt32;N] - TableScan: sq [a:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - "# + TableScan: sq [a:List(Field { data_type: UInt32, nullable: true });N] + " ) } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 407e3e6a9d29..0419161b532c 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -439,8 +439,8 @@ mod tests { let expression = cast_with_options(col("a", &schema)?, &schema, Decimal128(6, 2), None)?; let e = expression.evaluate(&batch).unwrap_err().strip_backtrace(); // panics on OK - assert_snapshot!(e, @"Arrow error: Invalid argument error: 12345679 is too large to store in a Decimal128 of precision 6. Max is 999999"); - + assert_snapshot!(e, @"Arrow error: Invalid argument error: 123456.79 is too large to store in a Decimal128 of precision 6. Max is 9999.99"); + // safe cast should return null let expression_safe = cast_with_options( col("a", &schema)?, &schema, diff --git a/datafusion/physical-expr/src/expressions/dynamic_filters.rs b/datafusion/physical-expr/src/expressions/dynamic_filters.rs index a53b32c97689..964a193db833 100644 --- a/datafusion/physical-expr/src/expressions/dynamic_filters.rs +++ b/datafusion/physical-expr/src/expressions/dynamic_filters.rs @@ -381,14 +381,14 @@ mod test { ) .unwrap(); let snap = dynamic_filter_1.snapshot().unwrap().unwrap(); - insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#); + insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 0 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#); let dynamic_filter_2 = reassign_expr_columns( Arc::clone(&dynamic_filter) as Arc, &filter_schema_2, ) .unwrap(); let snap = dynamic_filter_2.snapshot().unwrap().unwrap(); - insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, fail_on_overflow: false }"#); + insta::assert_snapshot!(format!("{snap:?}"), @r#"BinaryExpr { left: Column { name: "a", index: 1 }, op: Eq, right: Literal { value: Int32(42), field: Field { name: "lit", data_type: Int32 } }, fail_on_overflow: false }"#); // Both filters allow evaluating the same expression let batch_1 = RecordBatch::try_new( Arc::clone(&filter_schema_1), diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 891fd0ae4851..a76316369ec7 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -1696,7 +1696,7 @@ mod tests { // Get string representation of the plan assert_snapshot!(displayable(physical_plan.as_ref()).indent(true), @r#" - BoundedWindowAggExec: wdw=[last: Field { name: "last", data_type: Int32, nullable: 
true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-1): Field { name: "nth_value(-1)", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-2): Field { name: "nth_value(-2)", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] + BoundedWindowAggExec: wdw=[last: Field { "last": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-1): Field { "nth_value(-1)": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(-2): Field { "nth_value(-2)": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] DataSourceExec: partitions=1, partition_sizes=[3] "#); @@ -1814,7 +1814,7 @@ mod tests { // Get string representation of the plan assert_snapshot!(displayable(plan.as_ref()).indent(true), @r#" ProjectionExec: expr=[sn@0 as sn, hash@1 as hash, count([Column { name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]@2 as col_2] - BoundedWindowAggExec: wdw=[count([Column { name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]: Field { name: "count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Linear] + BoundedWindowAggExec: wdw=[count([Column { name: "sn", index: 0 }]) PARTITION BY: [[Column { name: "hash", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: "sn", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]: Field { "count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]": Int64 }, frame: RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Linear] StreamingTableExec: partition_sizes=1, projection=[sn, hash], infinite_source=true, output_ordering=[sn@0 ASC NULLS LAST] "#); diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 8e4131479e50..e9de1d9e9a9e 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -28,7 +28,9 @@ use arrow::datatypes::{ DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema, SchemaRef, TimeUnit, UnionMode, }; -use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator}; +use arrow::ipc::writer::{ + CompressionContext, DictionaryTracker, IpcDataGenerator, IpcWriteOptions, +}; use datafusion_common::{ config::{ CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions, @@ -1018,8 +1020,15 @@ fn encode_scalar_nested_value( let gen = IpcDataGenerator {}; let mut dict_tracker = DictionaryTracker::new(false); + 
let write_options = IpcWriteOptions::default(); + let mut compression_context = CompressionContext::default(); let (encoded_dictionaries, encoded_message) = gen - .encoded_batch(&batch, &mut dict_tracker, &Default::default()) + .encode( + &batch, + &mut dict_tracker, + &write_options, + &mut compression_context, + ) .map_err(|e| { Error::General(format!("Error encoding ScalarValue::List as IPC: {e}")) })?; diff --git a/datafusion/proto/src/bytes/mod.rs b/datafusion/proto/src/bytes/mod.rs index 12d9938373ce..6eab2239015a 100644 --- a/datafusion/proto/src/bytes/mod.rs +++ b/datafusion/proto/src/bytes/mod.rs @@ -313,7 +313,7 @@ pub fn physical_plan_from_json( let back: protobuf::PhysicalPlanNode = serde_json::from_str(json) .map_err(|e| plan_datafusion_err!("Error serializing plan: {e}"))?; let extension_codec = DefaultPhysicalExtensionCodec {}; - back.try_into_physical_plan(&ctx, &extension_codec) + back.try_into_physical_plan(ctx, &extension_codec) } /// Deserialize a PhysicalPlan from bytes diff --git a/datafusion/sql/tests/cases/params.rs b/datafusion/sql/tests/cases/params.rs index 4a484b1171bc..147628656d8f 100644 --- a/datafusion/sql/tests/cases/params.rs +++ b/datafusion/sql/tests/cases/params.rs @@ -667,11 +667,11 @@ fn test_insert_infer() { @r#" ** Initial Plan: Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: ($1, $2, $3) ** Final Plan: Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: (UInt32(1) AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3) "# ); @@ -698,11 +698,11 @@ fn test_prepare_statement_insert_infer() { ** Initial Plan: Prepare: "my_plan" [UInt32, Utf8, Utf8] Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: ($1, $2, $3) ** Final Plan: Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, 
CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 Values: (UInt32(1) AS $1, Utf8("Alan") AS $2, Utf8("Turing") AS $3) "# ); diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index f66af28f436e..96d9f23522f1 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -669,10 +669,10 @@ fn plan_insert() { assert_snapshot!( plan, @r#" - Dml: op=[Insert Into] table=[person] - Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(Nanosecond, None)) AS birth_date, CAST(NULL AS Int32) AS 😀 - Values: (CAST(Int64(1) AS UInt32), Utf8("Alan"), Utf8("Turing")) - "# + Dml: op=[Insert Into] table=[person] + Projection: column1 AS id, column2 AS first_name, column3 AS last_name, CAST(NULL AS Int32) AS age, CAST(NULL AS Utf8) AS state, CAST(NULL AS Float64) AS salary, CAST(NULL AS Timestamp(ns)) AS birth_date, CAST(NULL AS Int32) AS 😀 + Values: (CAST(Int64(1) AS UInt32), Utf8("Alan"), Utf8("Turing")) + "# ); } @@ -875,11 +875,11 @@ fn test_timestamp_filter() { let plan = logical_plan(sql).unwrap(); assert_snapshot!( plan, - @r#" - Projection: person.state - Filter: person.birth_date < CAST(CAST(Int64(158412331400600000) AS Timestamp(Second, None)) AS Timestamp(Nanosecond, None)) - TableScan: person - "# + @r" + Projection: person.state + Filter: person.birth_date < CAST(CAST(Int64(158412331400600000) AS Timestamp(s)) AS Timestamp(ns)) + TableScan: person + " ); } @@ -1586,11 +1586,11 @@ fn select_from_typed_string_values() { assert_snapshot!( plan, @r#" - Projection: t.col1, t.col2 - SubqueryAlias: t - Projection: column1 AS col1, column2 AS col2 - Values: (CAST(Utf8("2021-06-10 17:01:00Z") AS Timestamp(Nanosecond, None)), CAST(Utf8("2004-04-09") AS Date32)) - "# + Projection: t.col1, t.col2 + SubqueryAlias: t + Projection: column1 AS col1, column2 AS col2 + Values: (CAST(Utf8("2021-06-10 17:01:00Z") AS Timestamp(ns)), CAST(Utf8("2004-04-09") AS Date32)) + "# ); } @@ -3151,7 +3151,7 @@ fn select_typed_time_string() { assert_snapshot!( plan, @r#" - Projection: CAST(Utf8("08:09:10.123") AS Time64(Nanosecond)) AS time + Projection: CAST(Utf8("08:09:10.123") AS Time64(ns)) AS time EmptyRelation: rows=1 "# ); @@ -4686,7 +4686,7 @@ fn test_custom_type_plan() -> Result<()> { assert_snapshot!( plan, @r#" - Projection: CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)) + Projection: CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)) EmptyRelation: rows=1 "# ); @@ -4696,7 +4696,7 @@ fn test_custom_type_plan() -> Result<()> { assert_snapshot!( plan, @r#" - Projection: CAST(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)) AS Timestamp(Nanosecond, None)) + Projection: CAST(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)) AS Timestamp(ns)) EmptyRelation: rows=1 "# ); @@ -4708,7 +4708,7 @@ fn test_custom_type_plan() -> Result<()> { assert_snapshot!( plan, @r#" - Projection: make_array(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(Nanosecond, None)), CAST(Utf8("2001-01-02 18:00:00") AS Timestamp(Nanosecond, None))) + Projection: make_array(CAST(Utf8("2001-01-01 18:00:00") AS Timestamp(ns)), CAST(Utf8("2001-01-02 18:00:00") AS Timestamp(ns))) EmptyRelation: rows=1 "# ); diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 43899642a93a..29f0241c8862 100644 --- 
a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -710,13 +710,13 @@ select query TTT select arrow_typeof(column1), arrow_typeof(column2), arrow_typeof(column3) from arrays; ---- -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable 
Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) +List(nullable List(nullable Int64)) List(nullable Float64) List(nullable Utf8) # arrays table query ??? @@ -1182,7 +1182,7 @@ select make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)')) query T select arrow_typeof(make_array(make_array(1), arrow_cast(make_array(-1), 'LargeList(Int8)'))); ---- -List(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable LargeList(nullable Int64)) query ??? @@ -3292,7 +3292,7 @@ select array_concat([arrow_cast('1', 'Utf8'), arrow_cast('2', 'Utf8')], [arrow_cast('3', 'Utf8View')]), arrow_typeof(array_concat([arrow_cast('1', 'Utf8'), arrow_cast('2', 'Utf8')], [arrow_cast('3', 'Utf8View')])); ---- -[1, 2, 3] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2, 3] List(nullable Utf8View) # array_concat error query error DataFusion error: Error during planning: Execution error: Function 'array_concat' user-defined coercion failed with "Error during planning: array_concat does not support type Int64" @@ -4585,7 +4585,7 @@ NULL [baz] baz query T SELECT arrow_typeof(make_array(arrow_cast('a', 'Utf8View'), 'b', 'c', 'd')); ---- -List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Utf8View) # expect a,b,c,d. make_array forces all types to be of a common type (see above) query T @@ -7653,8 +7653,8 @@ CREATE EXTERNAL TABLE fixed_size_list_array STORED AS PARQUET LOCATION '../core/ query T select arrow_typeof(f0) from fixed_size_list_array; ---- -FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2) -FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2) +FixedSizeList(2 x nullable Int64) +FixedSizeList(2 x nullable Int64) query ? select * from fixed_size_list_array; @@ -7683,8 +7683,8 @@ select make_array(arrow_cast(f0, 'List(Int64)')) from fixed_size_list_array query T select arrow_typeof(make_array(arrow_cast(f0, 'List(Int64)'))) from fixed_size_list_array ---- -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int64)) +List(nullable List(nullable Int64)) query ? 
select make_array(f0) from fixed_size_list_array @@ -7695,8 +7695,8 @@ select make_array(f0) from fixed_size_list_array query T select arrow_typeof(make_array(f0)) from fixed_size_list_array ---- -List(Field { name: "item", data_type: FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) -List(Field { name: "item", data_type: FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 2), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable FixedSizeList(2 x nullable Int64)) +List(nullable FixedSizeList(2 x nullable Int64)) query ? select array_concat(column1, [7]) from arrays_values_v2; @@ -8275,19 +8275,19 @@ select * from test_create_array_table; query T select arrow_typeof(a) from test_create_array_table; ---- -List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Int32) query T select arrow_typeof(c) from test_create_array_table; ---- -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int32)) # Test casting to array types # issue: https://github.com/apache/datafusion/issues/9440 query ??T select [1,2,3]::int[], [['1']]::int[][], arrow_typeof([]::text[]); ---- -[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2, 3] [[1]] List(nullable Utf8View) # test empty arrays return length # issue: https://github.com/apache/datafusion/pull/12459 @@ -8307,8 +8307,8 @@ create table fixed_size_col_table (a int[3]) as values ([1,2,3]), ([4,5,6]); query T select arrow_typeof(a) from fixed_size_col_table; ---- -FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) -FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) +FixedSizeList(3 x nullable Int32) +FixedSizeList(3 x nullable Int32) query ? rowsort SELECT DISTINCT a FROM fixed_size_col_table diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt index 654218531f1d..ac32ef821bc4 100644 --- a/datafusion/sqllogictest/test_files/arrow_typeof.slt +++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt @@ -61,13 +61,13 @@ Decimal128(38, 10) query T SELECT arrow_typeof(now()::timestamp) ---- -Timestamp(Nanosecond, None) +Timestamp(ns) # arrow_typeof_timestamp_utc query T SELECT arrow_typeof(now()) ---- -Timestamp(Nanosecond, Some("+00:00")) +Timestamp(ns, "+00:00") # arrow_typeof_timestamp_date32( query T @@ -98,7 +98,7 @@ SELECT arrow_cast('1') query error DataFusion error: Execution error: arrow_cast requires its second argument to be a non\-empty constant string SELECT arrow_cast('1', 43) -query error Error unrecognized word: unknown +query error DataFusion error: Execution error: Unsupported type 'unknown'\. Must be a supported arrow type name such as 'Int32' or 'Timestamp\(ns\)'\. 
Error unknown token: unknown SELECT arrow_cast('1', 'unknown') # Round Trip tests: @@ -130,7 +130,7 @@ SELECT arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, Some("+08:00"))')) as col_tstz_ns, arrow_typeof(arrow_cast('foo', 'Dictionary(Int32, Utf8)')) as col_dict ---- -Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float16 Float32 Float64 Utf8 LargeUtf8 Utf8View Binary LargeBinary Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) Timestamp(Second, Some("+08:00")) Timestamp(Millisecond, Some("+08:00")) Timestamp(Microsecond, Some("+08:00")) Timestamp(Nanosecond, Some("+08:00")) Dictionary(Int32, Utf8) +Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float16 Float32 Float64 Utf8 LargeUtf8 Utf8View Binary LargeBinary Timestamp(s) Timestamp(ms) Timestamp(µs) Timestamp(ns) Timestamp(s, "+08:00") Timestamp(ms, "+08:00") Timestamp(µs, "+08:00") Timestamp(ns, "+08:00") Dictionary(Int32, Utf8) @@ -255,7 +255,7 @@ SELECT arrow_typeof(col_ts_ns) FROM foo; ---- -Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) +Timestamp(s) Timestamp(ms) Timestamp(µs) Timestamp(ns) statement ok @@ -316,7 +316,7 @@ select arrow_cast(interval '30 minutes', 'Duration(Second)'); ---- 0 days 0 hours 30 mins 0 secs -query error DataFusion error: This feature is not implemented: Unsupported CAST from Utf8 to Duration\(Second\) +query error DataFusion error: This feature is not implemented: Unsupported CAST from Utf8 to Duration\(s\) select arrow_cast('30 minutes', 'Duration(Second)'); @@ -357,12 +357,12 @@ select arrow_cast(make_array(1, 2, 3), 'List(Int64)'); query T select arrow_typeof(arrow_cast(make_array(1, 2, 3), 'List(Int64)')); ---- -List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Int64) query T select arrow_typeof(arrow_cast(arrow_cast(make_array([1, 2, 3]), 'LargeList(LargeList(Int64))'), 'List(List(Int64))')); ---- -List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable List(nullable Int64)) ## LargeList @@ -380,12 +380,12 @@ select arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'); query T select arrow_typeof(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')); ---- -LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +LargeList(nullable Int64) query T select arrow_typeof(arrow_cast(make_array([1, 2, 3]), 'LargeList(LargeList(Int64))')); ---- -LargeList(Field { name: "item", data_type: LargeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +LargeList(nullable LargeList(nullable Int64)) ## FixedSizeList @@ -417,7 +417,7 @@ select arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'); query T select arrow_typeof(arrow_cast(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 'FixedSizeList(3, Int64)')); ---- -FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) +FixedSizeList(3 x nullable Int64) query ? 
select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)'); diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 2f9173d2dcbd..352300e753a7 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -594,4 +594,4 @@ query I SELECT CASE WHEN a = 'a' THEN 0 WHEN a = 'b' THEN 1 ELSE 2 END FROM (VALUES (NULL), ('z')) t(a) ---- 2 -2 \ No newline at end of file +2 diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index 9740bade5e27..e34a601851d7 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -199,14 +199,14 @@ select coalesce(array[1, 2], array[3, 4]), arrow_typeof(coalesce(array[1, 2], array[3, 4])); ---- -[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2] List(nullable Int64) query ?T select coalesce(null, array[3, 4]), arrow_typeof(coalesce(array[1, 2], array[3, 4])); ---- -[3, 4] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[3, 4] List(nullable Int64) # coalesce with array query ?T @@ -214,7 +214,7 @@ select coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')]), arrow_typeof(coalesce(array[1, 2], array[arrow_cast(3, 'Int32'), arrow_cast(4, 'Int32')])); ---- -[1, 2] List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2] List(nullable Int64) # test dict(int32, utf8) statement ok diff --git a/datafusion/sqllogictest/test_files/count_star_rule.slt b/datafusion/sqllogictest/test_files/count_star_rule.slt index 826742267290..b78c021a565c 100644 --- a/datafusion/sqllogictest/test_files/count_star_rule.slt +++ b/datafusion/sqllogictest/test_files/count_star_rule.slt @@ -88,7 +88,7 @@ logical_plan 03)----TableScan: t1 projection=[a] physical_plan 01)ProjectionExec: expr=[a@0 as a, count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as count_a] -02)--WindowAggExec: wdw=[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) PARTITION BY [t1.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/current_time_timezone.slt b/datafusion/sqllogictest/test_files/current_time_timezone.slt index a9e27bd4045f..c80c4b51d5ac 100644 --- a/datafusion/sqllogictest/test_files/current_time_timezone.slt +++ b/datafusion/sqllogictest/test_files/current_time_timezone.slt @@ -29,7 +29,7 @@ true query T SELECT arrow_typeof(current_time()); ---- -Time64(Nanosecond) +Time64(ns) 
# Test 3: Set timezone to +08:00 and verify current_time is still stable statement ok @@ -44,7 +44,7 @@ true query T SELECT arrow_typeof(current_time()); ---- -Time64(Nanosecond) +Time64(ns) # Test 5: Test with negative offset timezone statement ok diff --git a/datafusion/sqllogictest/test_files/dates.slt b/datafusion/sqllogictest/test_files/dates.slt index 2e91a0363db0..a309be114809 100644 --- a/datafusion/sqllogictest/test_files/dates.slt +++ b/datafusion/sqllogictest/test_files/dates.slt @@ -85,9 +85,14 @@ g h ## Plan error when compare Utf8 and timestamp in where clause -statement error DataFusion error: type_coercion\ncaused by\nError during planning: Cannot coerce arithmetic expression Timestamp\(Nanosecond, Some\("\+00:00"\)\) \+ Utf8 to valid types +statement error select i_item_desc from test where d3_date > now() + '5 days'; +---- +DataFusion error: type_coercion +caused by +Error during planning: Cannot coerce arithmetic expression Timestamp(ns, "+00:00") + Utf8 to valid types + # DATE minus DATE # https://github.com/apache/arrow-rs/issues/4383 diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index bc6cbfab0cae..64c78284594f 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -867,7 +867,7 @@ query TTTTTT show columns FROM table_with_pk; ---- datafusion public table_with_pk sn Int32 NO -datafusion public table_with_pk ts Timestamp(Nanosecond, Some("+00:00")) NO +datafusion public table_with_pk ts Timestamp(ns, "+00:00") NO datafusion public table_with_pk currency Utf8View NO datafusion public table_with_pk amount Float32 YES diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index 4c184c04d128..88347965c67a 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -83,7 +83,7 @@ float_col Float32 YES double_col Float64 YES date_string_col Utf8View YES string_col Utf8View YES -timestamp_col Timestamp(Nanosecond, None) YES +timestamp_col Timestamp(ns) YES year Int32 YES month Int32 YES diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt index 9e8a39494095..fd9a7fb9ce44 100644 --- a/datafusion/sqllogictest/test_files/dictionary.slt +++ b/datafusion/sqllogictest/test_files/dictionary.slt @@ -85,7 +85,7 @@ f1 Float64 YES f2 Utf8 YES f3 Utf8 YES f4 Float64 YES -time Timestamp(Nanosecond, None) YES +time Timestamp(ns) YES # in list with dictionary input query BBB @@ -157,7 +157,7 @@ DESCRIBE m2; type Dictionary(Int32, Utf8) YES tag_id Dictionary(Int32, Utf8) YES f5 Float64 YES -time Timestamp(Nanosecond, None) YES +time Timestamp(ns) YES query I select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00'; diff --git a/datafusion/sqllogictest/test_files/expr/date_part.slt b/datafusion/sqllogictest/test_files/expr/date_part.slt index 64f16f72421a..bee8602d80bd 100644 --- a/datafusion/sqllogictest/test_files/expr/date_part.slt +++ b/datafusion/sqllogictest/test_files/expr/date_part.slt @@ -1005,10 +1005,10 @@ SELECT extract(day from arrow_cast(864000, 'Duration(Second)')) ---- 10 -query error DataFusion error: Arrow error: Compute error: Month does not support: Duration\(Second\) +query error DataFusion error: Arrow error: Compute error: Month does not support: Duration\(s\) SELECT extract(month from arrow_cast(864000, 'Duration(Second)')) -query error DataFusion error: Arrow 
error: Compute error: Year does not support: Duration\(Second\) +query error DataFusion error: Arrow error: Compute error: Year does not support: Duration\(s\) SELECT extract(year from arrow_cast(864000, 'Duration(Second)')) query I diff --git a/datafusion/sqllogictest/test_files/float16.slt b/datafusion/sqllogictest/test_files/float16.slt index 5e59c730f078..699eb81844a4 100644 --- a/datafusion/sqllogictest/test_files/float16.slt +++ b/datafusion/sqllogictest/test_files/float16.slt @@ -51,13 +51,14 @@ NULL NULL NULL NULL NULL NULL NaN NaN NaN NaN NaN NaN # Try coercing with literal NULL -query error +query R select column1 + NULL from float16s; ---- -DataFusion error: type_coercion -caused by -Error during planning: Cannot automatically convert Null to Float16 - +NULL +NULL +NULL +NULL +NULL # Test coercions with equality query BBBBBB @@ -78,11 +79,14 @@ false false false false false false # Try coercing with literal NULL -query error +query B select column1 = NULL from float16s; ---- -DataFusion error: Error during planning: Cannot infer common argument type for comparison operation Float16 = Null - +NULL +NULL +NULL +NULL +NULL # Cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index b72f73d44698..08636b482e38 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -3646,7 +3646,7 @@ physical_plan 07)------------AggregateExec: mode=Partial, gby=[sn@2 as sn, zip_code@0 as zip_code, country@1 as country, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum_amount@6 as sum_amount], aggr=[] 08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 09)----------------ProjectionExec: expr=[zip_code@0 as zip_code, country@1 as country, sn@2 as sn, ts@3 as ts, currency@4 as currency, amount@5 as amount, sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@6 as sum_amount] -10)------------------BoundedWindowAggExec: wdw=[sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +10)------------------BoundedWindowAggExec: wdw=[sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(l.amount) ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Float64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 11)--------------------DataSourceExec: partitions=1, partition_sizes=[2] @@ -3943,7 +3943,7 @@ physical_plan 04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true 06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] -07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW], mode=[Sorted] +07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true # reset partition number to 8. @@ -4065,7 +4065,7 @@ logical_plan 05)--------TableScan: multiple_ordered_table_with_pk projection=[b, c, d] physical_plan 01)ProjectionExec: expr=[c@0 as c, sum1@2 as sum1, sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as sumb] -02)--WindowAggExec: wdw=[sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(multiple_ordered_table_with_pk.b) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----ProjectionExec: expr=[c@0 as c, b@1 as b, sum(multiple_ordered_table_with_pk.d)@2 as sum1] 04)------AggregateExec: mode=Single, gby=[c@1 as c, b@0 as b], aggr=[sum(multiple_ordered_table_with_pk.d)], ordering_mode=PartiallySorted([0]) 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, d], output_ordering=[c@1 ASC NULLS LAST], constraints=[PrimaryKey([3])], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/information_schema_columns.slt b/datafusion/sqllogictest/test_files/information_schema_columns.slt index d348a764fa85..c733b3baa7a4 100644 --- a/datafusion/sqllogictest/test_files/information_schema_columns.slt +++ b/datafusion/sqllogictest/test_files/information_schema_columns.slt @@ -42,7 +42,7 @@ my_catalog my_schema table_with_many_types float64_col 1 NULL YES Float64 NULL N my_catalog my_schema table_with_many_types int32_col 0 NULL NO Int32 NULL NULL 32 2 NULL NULL NULL my_catalog my_schema table_with_many_types large_binary_col 5 NULL NO LargeBinary NULL 9223372036854775807 NULL NULL NULL NULL NULL my_catalog my_schema table_with_many_types large_utf8_col 3 NULL NO LargeUtf8 NULL 9223372036854775807 NULL NULL NULL NULL NULL -my_catalog my_schema table_with_many_types timestamp_nanos 6 NULL NO Timestamp(Nanosecond, None) NULL NULL NULL NULL NULL NULL NULL +my_catalog my_schema table_with_many_types timestamp_nanos 6 NULL NO Timestamp(ns) NULL NULL NULL NULL NULL NULL NULL my_catalog my_schema table_with_many_types utf8_col 2 NULL YES Utf8 NULL 2147483647 NULL NULL NULL NULL NULL # Cleanup diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 9a3c959884aa..b8b2a7c37276 100644 --- 
a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -68,7 +68,7 @@ physical_plan 02)--ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@0 as field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@1 as field2] 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST] 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, c1@0 as c1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 @@ -128,7 +128,7 @@ physical_plan 01)DataSinkExec: sink=MemoryTable (partitions=1) 02)--CoalescePartitionsExec 03)----ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as field1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS 
LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as field2] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=8192 07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 @@ -179,7 +179,7 @@ physical_plan 02)--ProjectionExec: expr=[a1@0 as a1, a2@1 as a2] 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST] 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as a2, c1@0 as c1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) 
PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index 075256ae4b92..dc8ef59bbedc 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -422,7 +422,7 @@ physical_plan 02)--ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@0 as field1, count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@1 as field2] 03)----SortPreservingMergeExec: [c1@2 ASC NULLS LAST] 04)------ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, c1@0 as c1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] 
ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 @@ -483,7 +483,7 @@ physical_plan 01)DataSinkExec: sink=ParquetSink(file_groups=[]) 02)--CoalescePartitionsExec 03)----ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as field1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as field2] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------SortExec: expr=[c1@0 ASC NULLS LAST, c9@2 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=8192 07)------------RepartitionExec: partitioning=Hash([c1@0], 8), input_partitions=8 diff --git a/datafusion/sqllogictest/test_files/interval.slt b/datafusion/sqllogictest/test_files/interval.slt index 1ef3048ddc66..8c5a4382ed2c 100644 --- a/datafusion/sqllogictest/test_files/interval.slt +++ b/datafusion/sqllogictest/test_files/interval.slt @@ -444,7 +444,7 @@ select '1 month'::interval + '1980-01-01T12:00:00'::timestamp; 
query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types select '1 month'::interval - '1980-01-01'::date; -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types select '1 month'::interval - '1980-01-01T12:00:00'::timestamp; # interval (array) + date / timestamp (array) @@ -466,7 +466,7 @@ select i + ts from t; query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types select i - d from t; -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types select i - ts from t; # interval unit abreiviation and plurals @@ -530,7 +530,7 @@ SELECT interval '5 day' hour query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Date32 to valid types select '1 month'::interval - d from t; -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types select '1 month'::interval - ts from t; # interval + date diff --git a/datafusion/sqllogictest/test_files/join_lists.slt b/datafusion/sqllogictest/test_files/join_lists.slt index c07bd85551f3..0a48a4f9203e 100644 --- a/datafusion/sqllogictest/test_files/join_lists.slt +++ b/datafusion/sqllogictest/test_files/join_lists.slt @@ -60,4 +60,3 @@ DROP TABLE categories_raw; statement ok DROP TABLE places; - diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 0174321dd831..4bdf2e5da963 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3199,7 +3199,7 @@ physical_plan 04)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@5 ASC NULLS LAST 05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 06)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -07)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 08)--------------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true 09)----CoalesceBatchesExec: target_batch_size=2 10)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST @@ -3237,7 +3237,7 @@ physical_plan 08)------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, rn1@5 ASC NULLS LAST 09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 10)----------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -11)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +11)------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 12)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true statement ok @@ -3276,14 +3276,14 @@ physical_plan 06)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2 07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 08)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -09)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true 11)------SortExec: expr=[a@1 ASC], preserve_partitioning=[true] 12)--------CoalesceBatchesExec: target_batch_size=2 13)----------RepartitionExec: partitioning=Hash([a@1], 2), input_partitions=2 14)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 15)--------------ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -16)----------------BoundedWindowAggExec: wdw=[row_number() ROWS 
BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +16)----------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 17)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true statement ok @@ -3318,7 +3318,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@1, a@1)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true 04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true # hash join should propagate ordering equivalence of the right side for RIGHT ANTI join. 
@@ -3345,7 +3345,7 @@ physical_plan 02)--HashJoinExec: mode=CollectLeft, join_type=RightAnti, on=[(a@0, a@1)] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a], output_ordering=[a@0 ASC], file_type=csv, has_header=true 04)----ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@5 as rn1] -05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +05)------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 06)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true # Test ordering preservation for RIGHT join @@ -3441,7 +3441,7 @@ physical_plan 04)------HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(d@1, d@1)], filter=CAST(a@0 AS Int64) >= CAST(a@1 AS Int64) - 10, projection=[a@0, d@1, row_n@4] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true 06)--------ProjectionExec: expr=[a@0 as a, d@1 as d, row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as row_n] -07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +07)----------BoundedWindowAggExec: wdw=[row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [r.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, d], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true # run query above in multiple partitions @@ -4036,12 +4036,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t1.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int))) 11)----------------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" 
}) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true }, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" }) # Test CROSS JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true \}, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\) select t1_id, t1_name, i from join_t1 t1 cross join lateral (select * from unnest(generate_series(1, t1_int))) as series(i); @@ -4061,12 +4061,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t2.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int))) 11)----------------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "t1_int", data_type: UInt32, nullable: true }, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" }) # Test INNER JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "t1_int", data_type: UInt32, nullable: true \}, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\) select t1_id, t1_name, i from join_t1 t2 inner join lateral (select * from unnest(generate_series(1, t1_int))) as series(i) on(t1_id > i); # Test RIGHT JOIN LATERAL syntax (unsupported) @@ -4671,7 +4671,7 @@ logical_plan 05)------Subquery: 06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id 07)----------TableScan: j2 projection=[j2_string, j2_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: 
"j1" }), name: "j1_id" }) query TT explain SELECT * FROM j1 JOIN (j2 JOIN j3 ON(j2_id = j3_id - 2)) ON(j1_id = j2_id), LATERAL (SELECT * FROM j3 WHERE j3_string = j2_string) as j4 @@ -4687,7 +4687,7 @@ logical_plan 08)----Subquery: 09)------Filter: j3.j3_string = outer_ref(j2.j2_string) 10)--------TableScan: j3 projection=[j3_string, j3_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j2_string", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j2" }), name: "j2_string" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j2_string", data_type: Utf8View, nullable: true }, Column { relation: Some(Bare { table: "j2" }), name: "j2_string" }) query TT explain SELECT * FROM j1, LATERAL (SELECT * FROM j1, LATERAL (SELECT * FROM j2 WHERE j1_id = j2_id) as j2) as j2; @@ -4703,7 +4703,7 @@ logical_plan 08)----------Subquery: 09)------------Filter: outer_ref(j1.j1_id) = j2.j2_id 10)--------------TableScan: j2 projection=[j2_string, j2_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) query TT explain SELECT j1_string, j2_string FROM j1 LEFT JOIN LATERAL (SELECT * FROM j2 WHERE j1_id < j2_id) AS j2 ON(true); @@ -4716,7 +4716,7 @@ logical_plan 05)------Subquery: 06)--------Filter: outer_ref(j1.j1_id) < j2.j2_id 07)----------TableScan: j2 projection=[j2_string, j2_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) query TT explain SELECT * FROM j1, (j2 LEFT JOIN LATERAL (SELECT * FROM j3 WHERE j1_id + j2_id = j3_id) AS j3 ON(true)); @@ -4730,7 +4730,7 @@ logical_plan 06)------Subquery: 07)--------Filter: outer_ref(j1.j1_id) + outer_ref(j2.j2_id) = j3.j3_id 08)----------TableScan: j3 projection=[j3_string, j3_id] -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "j1_id", data_type: Int32, nullable: true }, Column { relation: Some(Bare { table: "j1" }), name: "j1_id" }) query TT explain SELECT * FROM j1, LATERAL (SELECT 1) AS j2; diff --git a/datafusion/sqllogictest/test_files/map.slt 
b/datafusion/sqllogictest/test_files/map.slt index 4f1e5ef39a00..fc21638b3f3c 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -43,8 +43,8 @@ LOCATION '../core/tests/data/parquet_map.parquet'; query TTT describe data; ---- -ints Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO -strings Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO +ints Map("entries": Struct("key": Utf8, "value": Int64), unsorted) NO +strings Map("entries": Struct("key": Utf8, "value": Utf8), unsorted) NO timestamp Utf8View NO query ??T diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 11942108ab2b..c21f3129d4ee 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -268,7 +268,7 @@ FROM ( ) t GROUP BY 1 ---- -Timestamp(Millisecond, Some("UTC")) 2014-08-27T14:00:00Z 131072 +Timestamp(ms, "UTC") 2014-08-27T14:00:00Z 131072 # Test config listing_table_ignore_subdirectory: @@ -689,7 +689,7 @@ LOCATION '../../parquet-testing/data/int96_from_spark.parquet'; query TTT describe int96_from_spark ---- -a Timestamp(Nanosecond, None) YES +a Timestamp(ns) YES # Note that the values are read as nanosecond precision query P @@ -718,7 +718,7 @@ LOCATION '../../parquet-testing/data/int96_from_spark.parquet'; query TTT describe int96_from_spark; ---- -a Timestamp(Millisecond, None) YES +a Timestamp(ms) YES # Per https://github.com/apache/parquet-testing/blob/6e851ddd768d6af741c7b15dc594874399fc3cff/data/int96_from_spark.md?plain=1#L37 # these values should be @@ -742,7 +742,7 @@ select * from int96_from_spark 9999-12-31T03:00:00 2024-12-30T23:00:00 NULL -ERROR: Cast error: Failed to convert -9357363680509551 to datetime for Timestamp(Millisecond, None) +ERROR: Cast error: Failed to convert -9357363680509551 to datetime for Timestamp(ms) # Cleanup / reset default setting statement ok diff --git a/datafusion/sqllogictest/test_files/pwmj.slt b/datafusion/sqllogictest/test_files/pwmj.slt index 0014b3c545f2..eafa4d0ba394 100644 --- a/datafusion/sqllogictest/test_files/pwmj.slt +++ b/datafusion/sqllogictest/test_files/pwmj.slt @@ -158,7 +158,7 @@ ORDER BY 1,2; 33 44 44 55 -query TT +query TT EXPLAIN SELECT t1.t1_id, t2.t2_id FROM join_t1 t1 diff --git a/datafusion/sqllogictest/test_files/qualify.slt b/datafusion/sqllogictest/test_files/qualify.slt index d53b56ce58de..366d65df6792 100644 --- a/datafusion/sqllogictest/test_files/qualify.slt +++ b/datafusion/sqllogictest/test_files/qualify.slt @@ -275,7 +275,7 @@ physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 > 1, projection=[id@0, name@1] -04)------WindowAggExec: wdw=[count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { 
name: "count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +04)------WindowAggExec: wdw=[count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "count(Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 05)--------DataSourceExec: partitions=1, partition_sizes=[1] # plan row_number() @@ -293,7 +293,7 @@ physical_plan 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@0 > 1 04)------ProjectionExec: expr=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] -05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[dept@0 ASC NULLS LAST], preserve_partitioning=[false] 07)------------DataSourceExec: partitions=1, partition_sizes=[1] @@ -321,7 +321,7 @@ physical_plan 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 > Some(60000000000),14,6 04)------ProjectionExec: expr=[dept@0 as dept, avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] -05)--------WindowAggExec: wdw=[avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Decimal128(14, 6), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +05)--------WindowAggExec: wdw=[avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "avg(users.salary) PARTITION BY [users.dept] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Decimal128(14, 6), nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 06)----------SortExec: expr=[dept@0 ASC NULLS LAST], preserve_partitioning=[true] 
07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([dept@0], 4), input_partitions=4 @@ -358,7 +358,7 @@ physical_plan 04)------CoalesceBatchesExec: target_batch_size=8192 05)--------FilterExec: rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 = 1, projection=[dept@0, sum(users.salary)@1] 06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -07)------------BoundedWindowAggExec: wdw=[rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sum(users.salary) DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)--------------SortPreservingMergeExec: [sum(users.salary)@1 DESC] 09)----------------SortExec: expr=[sum(users.salary)@1 DESC], preserve_partitioning=[true] 10)------------------AggregateExec: mode=FinalPartitioned, gby=[dept@0 as dept], aggr=[sum(users.salary)] diff --git a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt index cb3c77cac8fb..7614caef666b 100644 --- a/datafusion/sqllogictest/test_files/spark/array/shuffle.slt +++ b/datafusion/sqllogictest/test_files/spark/array/shuffle.slt @@ -109,5 +109,3 @@ DROP TABLE test_shuffle_list_types; statement ok DROP TABLE test_shuffle_fixed_size; - - diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 95eeffc31903..0e3c5145d156 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -53,9 +53,9 @@ select * from struct_values; query TT select arrow_typeof(s1), arrow_typeof(s2) from struct_values; ---- -Struct(c0 Int32) Struct(a Int32, b Utf8View) -Struct(c0 Int32) Struct(a Int32, b Utf8View) -Struct(c0 Int32) Struct(a Int32, b Utf8View) +Struct("c0": nullable Int32) Struct("a": nullable Int32, "b": nullable Utf8View) +Struct("c0": nullable Int32) Struct("a": nullable Int32, "b": nullable Utf8View) +Struct("c0": nullable Int32) Struct("a": nullable Int32, "b": nullable Utf8View) # struct[i] @@ -229,12 +229,12 @@ select named_struct('field_a', 1, 'field_b', 2); query T select arrow_typeof(named_struct('first', 1, 'second', 2, 'third', 3)); ---- -Struct(first Int64, second Int64, third Int64) +Struct("first": nullable Int64, "second": nullable Int64, "third": nullable Int64) query T select arrow_typeof({'first': 1, 'second': 2, 'third': 3}); ---- -Struct(first Int64, second Int64, third Int64) +Struct("first": nullable Int64, "second": nullable Int64, "third": nullable Int64) # test nested struct literal query ? 
@@ -413,7 +413,7 @@ create table t(a struct, b struct) as valu query T select arrow_typeof([a, b]) from t; ---- -List(Field { name: "item", data_type: Struct([Field { name: "r", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Struct("r": nullable Utf8View, "c": nullable Float32)) query ? select [a, b] from t; @@ -464,12 +464,12 @@ select * from t; query T select arrow_typeof(c1) from t; ---- -Struct(r Utf8View, b Int32) +Struct("r": nullable Utf8View, "b": nullable Int32) query T select arrow_typeof(c2) from t; ---- -Struct(r Utf8View, b Float32) +Struct("r": nullable Utf8View, "b": nullable Float32) statement ok drop table t; @@ -486,8 +486,8 @@ select * from t; query T select arrow_typeof(column1) from t; ---- -Struct(r Utf8, c Float64) -Struct(r Utf8, c Float64) +Struct("r": nullable Utf8, "c": nullable Float64) +Struct("r": nullable Utf8, "c": nullable Float64) statement ok drop table t; @@ -519,9 +519,9 @@ select coalesce(s1) from t; query T select arrow_typeof(coalesce(s1, s2)) from t; ---- -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) statement ok drop table t; @@ -546,9 +546,9 @@ select coalesce(s1, s2) from t; query T select arrow_typeof(coalesce(s1, s2)) from t; ---- -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) -Struct(a Float32, b Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) +Struct("a": nullable Float32, "b": nullable Utf8View) statement ok drop table t; @@ -583,7 +583,7 @@ create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as valu query T select arrow_typeof([a, b]) from t; ---- -List(Field { name: "item", data_type: Struct([Field { name: "r", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "c", data_type: Float32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +List(nullable Struct("r": nullable Utf8View, "c": nullable Float32)) statement ok drop table t; @@ -606,13 +606,13 @@ create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, query T select arrow_typeof(a) from t; ---- -Struct(r Utf8View, c Int32, g Float32) +Struct("r": nullable Utf8View, "c": nullable Int32, "g": nullable Float32) # type of each column should not coerced but perserve as it is query T select arrow_typeof(b) from t; ---- -Struct(r Utf8View, c Float32, g Int32) +Struct("r": nullable Utf8View, "c": nullable Float32, "g": nullable Int32) statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt index 1e5a3c8f526a..ea7addd8e36f 100644 --- a/datafusion/sqllogictest/test_files/subquery_sort.slt +++ b/datafusion/sqllogictest/test_files/subquery_sort.slt @@ -100,7 +100,7 @@ physical_plan 01)ProjectionExec: expr=[c1@0 as c1, r@1 as r] 02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[c1@0 as c1, 
rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9] -04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sink_table.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true @@ -126,7 +126,7 @@ physical_plan 01)ProjectionExec: expr=[c1@0 as c1, r@1 as r] 02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9] -04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: partitions=1, partition_sizes=[1] diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 6fe9995c7b67..84dd7098a2ee 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -79,7 +79,7 @@ SET TIME ZONE = '+08' query T select arrow_typeof(now()); ---- -Timestamp(Nanosecond, Some("+08")) +Timestamp(ns, "+08") query I SELECT count(1) result FROM (SELECT now() as n) a WHERE n > '2000-01-01'::date; @@ -691,11 +691,11 @@ select ---- 08:09:10.123456789 13:14:15.123456 13:14:15.123 13:14:15 -query error Cannot cast string 'not a time' to value of Time64\(Nanosecond\) type +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'not a time' to value of Time64\(ns\) type SELECT TIME 'not a time' as time; # invalid time -query error Cannot cast string '24:01:02' to value of Time64\(Nanosecond\) type +query error DataFusion error: Arrow error: Cast error: Cannot cast string '24:01:02' to value of Time64\(ns\) type SELECT TIME '24:01:02' as time; # invalid timezone @@ -908,7 +908,7 @@ from 
(values query T SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_micros(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')) ---- -Timestamp(Microsecond, None) +Timestamp(µs) query P SELECT DATE_BIN(INTERVAL '15 minute', to_timestamp_millis(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z') @@ -926,7 +926,7 @@ from (values query T SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_millis(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')) ---- -Timestamp(Millisecond, None) +Timestamp(ms) query P SELECT DATE_BIN(INTERVAL '15 minute', to_timestamp_seconds(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z') @@ -944,7 +944,7 @@ from (values query T SELECT arrow_typeof(DATE_BIN(INTERVAL '15 minute', to_timestamp_seconds(TIMESTAMP '2022-08-03 14:38:50Z'), TIMESTAMP '1970-01-01 00:00:00Z')) ---- -Timestamp(Second, None) +Timestamp(s) # month interval with INTERVAL keyword in date_bin with default start time query P @@ -1540,24 +1540,24 @@ from timestamp_utc; -- have to convert to utc prior to converting to berlin query PT select ts, arrow_typeof(ts) from timestamp_utc order by ts; ---- -2024-10-27T00:00:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T00:30:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T01:30:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T02:00:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T02:30:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T03:00:00Z Timestamp(Nanosecond, Some("UTC")) -2024-10-27T03:30:00Z Timestamp(Nanosecond, Some("UTC")) +2024-10-27T00:00:00Z Timestamp(ns, "UTC") +2024-10-27T00:30:00Z Timestamp(ns, "UTC") +2024-10-27T01:30:00Z Timestamp(ns, "UTC") +2024-10-27T02:00:00Z Timestamp(ns, "UTC") +2024-10-27T02:30:00Z Timestamp(ns, "UTC") +2024-10-27T03:00:00Z Timestamp(ns, "UTC") +2024-10-27T03:30:00Z Timestamp(ns, "UTC") query PT select ts, arrow_typeof(ts) from timestamp_berlin order by ts; ---- -2024-10-27T02:00:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T02:30:00+02:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T02:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T03:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T03:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T04:00:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) -2024-10-27T04:30:00+01:00 Timestamp(Nanosecond, Some("Europe/Berlin")) +2024-10-27T02:00:00+02:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T02:30:00+02:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T02:30:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T03:00:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T03:30:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T04:00:00+01:00 Timestamp(ns, "Europe/Berlin") +2024-10-27T04:30:00+01:00 Timestamp(ns, "Europe/Berlin") # date trunc in utc with DST query PPPP @@ -1624,24 +1624,24 @@ from timestamp_utc; -- have to convert to utc prior to converting to Sau Paulo query PT select ts, arrow_typeof(ts) from timestamp_utc order by ts; ---- -2018-11-04T01:00:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T01:30:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T02:30:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T03:00:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T03:30:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T04:00:00Z Timestamp(Nanosecond, Some("UTC")) -2018-11-04T04:30:00Z Timestamp(Nanosecond, Some("UTC")) +2018-11-04T01:00:00Z Timestamp(ns, "UTC") 
+2018-11-04T01:30:00Z Timestamp(ns, "UTC") +2018-11-04T02:30:00Z Timestamp(ns, "UTC") +2018-11-04T03:00:00Z Timestamp(ns, "UTC") +2018-11-04T03:30:00Z Timestamp(ns, "UTC") +2018-11-04T04:00:00Z Timestamp(ns, "UTC") +2018-11-04T04:30:00Z Timestamp(ns, "UTC") query PT select ts, arrow_typeof(ts) from timestamp_sao_paulo order by ts; ---- -2018-11-03T22:00:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-03T22:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-03T23:30:00-03:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T01:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T01:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T02:00:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) -2018-11-04T02:30:00-02:00 Timestamp(Nanosecond, Some("America/Sao_Paulo")) +2018-11-03T22:00:00-03:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-03T22:30:00-03:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-03T23:30:00-03:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T01:00:00-02:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T01:30:00-02:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T02:00:00-02:00 Timestamp(ns, "America/Sao_Paulo") +2018-11-04T02:30:00-02:00 Timestamp(ns, "America/Sao_Paulo") # date trunc in utc with DST query PPPP @@ -1797,7 +1797,7 @@ SELECT ts1 + i FROM foo; 2003-07-12T01:31:15.000123463 # Timestamp + Timestamp => error -query error DataFusion error: Error during planning: Cannot get result type for temporal operation Timestamp\(Nanosecond, None\) \+ Timestamp\(Nanosecond, None\): Invalid argument error: Invalid timestamp arithmetic operation: Timestamp\(Nanosecond, None\) \+ Timestamp\(Nanosecond, None\) +query error DataFusion error: Error during planning: Cannot get result type for temporal operation Timestamp\(ns\) \+ Timestamp\(ns\): Invalid argument error: Invalid timestamp arithmetic operation: Timestamp\(ns\) \+ Timestamp\(ns\) SELECT ts1 + ts2 FROM foo; @@ -2256,7 +2256,7 @@ SET TIME ZONE = '+00' query T SELECT arrow_typeof(time) FROM foo LIMIT 1 ---- -Timestamp(Nanosecond, Some("+05:00")) +Timestamp(ns, "+05:00") # check date_trunc query P @@ -2271,27 +2271,27 @@ SELECT date_trunc('day', time) FROM foo query T SELECT arrow_typeof(date_trunc('day', time)) FROM foo LIMIT 1 ---- -Timestamp(Nanosecond, Some("+05:00")) +Timestamp(ns, "+05:00") query T select arrow_typeof(date_trunc('minute', to_timestamp_seconds(61))) ---- -Timestamp(Second, None) +Timestamp(s) query T select arrow_typeof(date_trunc('second', to_timestamp_millis(61))) ---- -Timestamp(Millisecond, None) +Timestamp(ms) query T select arrow_typeof(date_trunc('millisecond', to_timestamp_micros(61))) ---- -Timestamp(Microsecond, None) +Timestamp(µs) query T select arrow_typeof(date_trunc('microsecond', to_timestamp(61))) ---- -Timestamp(Nanosecond, None) +Timestamp(ns) # check date_bin query P @@ -2306,7 +2306,7 @@ SELECT date_bin(INTERVAL '1 day', time, '1970-01-01T00:00:00+05:00') FROM foo query T SELECT arrow_typeof(date_bin(INTERVAL '1 day', time, '1970-01-01T00:00:00+05:00')) FROM foo LIMIT 1 ---- -Timestamp(Nanosecond, Some("+05:00")) +Timestamp(ns, "+05:00") # timestamp comparison with and without timezone @@ -2348,7 +2348,7 @@ true true true true true true true true true true true true true query TTT SELECT arrow_typeof(to_timestamp(1)), arrow_typeof(to_timestamp(null)), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000')) ---- -Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) 
Timestamp(Nanosecond, None) +Timestamp(ns) Timestamp(ns) Timestamp(ns) # verify timestamp output types using timestamp literal syntax query BBBBBB @@ -2384,7 +2384,7 @@ NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01 1969-12-31T23:5 query TTT SELECT arrow_typeof(to_timestamp(1, '%c', '%s')), arrow_typeof(to_timestamp(null, '%+', '%s')), arrow_typeof(to_timestamp('2023-01-10 12:34:56.000', '%Y-%m-%d %H:%M:%S%.f')) ---- -Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) Timestamp(Nanosecond, None) +Timestamp(ns) Timestamp(ns) Timestamp(ns) # to_timestamp with invalid formatting query error input contains invalid characters @@ -2690,8 +2690,8 @@ SELECT t1.ts, t1.ts + INTERVAL '1' SECOND FROM t1; query PT SELECT t1.ts::timestamptz, arrow_typeof(t1.ts::timestamptz) FROM t1; ---- -2018-07-01T06:00:00Z Timestamp(Nanosecond, Some("+00")) -2018-07-01T07:00:00Z Timestamp(Nanosecond, Some("+00")) +2018-07-01T06:00:00Z Timestamp(ns, "+00") +2018-07-01T07:00:00Z Timestamp(ns, "+00") query D SELECT 0::TIME @@ -3281,7 +3281,7 @@ from ( select '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' as time ); ---- -2024-04-01T00:00:20+02:00 Timestamp(Nanosecond, Some("Europe/Brussels")) 2024-04-01T00:00:20 Timestamp(Nanosecond, None) +2024-04-01T00:00:20+02:00 Timestamp(ns, "Europe/Brussels") 2024-04-01T00:00:20 Timestamp(ns) # use to_local_time() in date_bin() query P @@ -3326,53 +3326,53 @@ from t; query PPT select column1, to_local_time(column1::timestamp), arrow_typeof(to_local_time(column1::timestamp)) from t_utc; ---- -NULL NULL Timestamp(Nanosecond, None) -2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(Nanosecond, None) -2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(Nanosecond, None) -2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(Nanosecond, None) -2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(Nanosecond, None) -2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(Nanosecond, None) -2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(Nanosecond, None) -2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(Nanosecond, None) -2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(Nanosecond, None) -2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(Nanosecond, None) -2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(Nanosecond, None) -2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(Nanosecond, None) -2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(Nanosecond, None) +NULL NULL Timestamp(ns) +2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(ns) +2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(ns) +2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(ns) +2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(ns) +2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(ns) +2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(ns) +2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(ns) +2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(ns) +2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(ns) +2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(ns) +2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(ns) +2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(ns) query PPT select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_utc; ---- -NULL NULL Timestamp(Nanosecond, None) -2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(Nanosecond, None) -2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(Nanosecond, None) -2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(Nanosecond, None) -2024-04-01T00:00:01Z 2024-04-01T00:00:01 
Timestamp(Nanosecond, None) -2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(Nanosecond, None) -2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(Nanosecond, None) -2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(Nanosecond, None) -2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(Nanosecond, None) -2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(Nanosecond, None) -2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(Nanosecond, None) -2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(Nanosecond, None) -2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(Nanosecond, None) +NULL NULL Timestamp(ns) +2024-01-01T00:00:01Z 2024-01-01T00:00:01 Timestamp(ns) +2024-02-01T00:00:01Z 2024-02-01T00:00:01 Timestamp(ns) +2024-03-01T00:00:01Z 2024-03-01T00:00:01 Timestamp(ns) +2024-04-01T00:00:01Z 2024-04-01T00:00:01 Timestamp(ns) +2024-05-01T00:00:01Z 2024-05-01T00:00:01 Timestamp(ns) +2024-06-01T00:00:01Z 2024-06-01T00:00:01 Timestamp(ns) +2024-07-01T00:00:01Z 2024-07-01T00:00:01 Timestamp(ns) +2024-08-01T00:00:01Z 2024-08-01T00:00:01 Timestamp(ns) +2024-09-01T00:00:01Z 2024-09-01T00:00:01 Timestamp(ns) +2024-10-01T00:00:01Z 2024-10-01T00:00:01 Timestamp(ns) +2024-11-01T00:00:01Z 2024-11-01T00:00:01 Timestamp(ns) +2024-12-01T00:00:01Z 2024-12-01T00:00:01 Timestamp(ns) query PPT select column1, to_local_time(column1), arrow_typeof(to_local_time(column1)) from t_timezone; ---- -NULL NULL Timestamp(Nanosecond, None) -2024-01-01T00:00:01+01:00 2024-01-01T00:00:01 Timestamp(Nanosecond, None) -2024-02-01T00:00:01+01:00 2024-02-01T00:00:01 Timestamp(Nanosecond, None) -2024-03-01T00:00:01+01:00 2024-03-01T00:00:01 Timestamp(Nanosecond, None) -2024-04-01T00:00:01+02:00 2024-04-01T00:00:01 Timestamp(Nanosecond, None) -2024-05-01T00:00:01+02:00 2024-05-01T00:00:01 Timestamp(Nanosecond, None) -2024-06-01T00:00:01+02:00 2024-06-01T00:00:01 Timestamp(Nanosecond, None) -2024-07-01T00:00:01+02:00 2024-07-01T00:00:01 Timestamp(Nanosecond, None) -2024-08-01T00:00:01+02:00 2024-08-01T00:00:01 Timestamp(Nanosecond, None) -2024-09-01T00:00:01+02:00 2024-09-01T00:00:01 Timestamp(Nanosecond, None) -2024-10-01T00:00:01+02:00 2024-10-01T00:00:01 Timestamp(Nanosecond, None) -2024-11-01T00:00:01+01:00 2024-11-01T00:00:01 Timestamp(Nanosecond, None) -2024-12-01T00:00:01+01:00 2024-12-01T00:00:01 Timestamp(Nanosecond, None) +NULL NULL Timestamp(ns) +2024-01-01T00:00:01+01:00 2024-01-01T00:00:01 Timestamp(ns) +2024-02-01T00:00:01+01:00 2024-02-01T00:00:01 Timestamp(ns) +2024-03-01T00:00:01+01:00 2024-03-01T00:00:01 Timestamp(ns) +2024-04-01T00:00:01+02:00 2024-04-01T00:00:01 Timestamp(ns) +2024-05-01T00:00:01+02:00 2024-05-01T00:00:01 Timestamp(ns) +2024-06-01T00:00:01+02:00 2024-06-01T00:00:01 Timestamp(ns) +2024-07-01T00:00:01+02:00 2024-07-01T00:00:01 Timestamp(ns) +2024-08-01T00:00:01+02:00 2024-08-01T00:00:01 Timestamp(ns) +2024-09-01T00:00:01+02:00 2024-09-01T00:00:01 Timestamp(ns) +2024-10-01T00:00:01+02:00 2024-10-01T00:00:01 Timestamp(ns) +2024-11-01T00:00:01+01:00 2024-11-01T00:00:01 Timestamp(ns) +2024-12-01T00:00:01+01:00 2024-12-01T00:00:01 Timestamp(ns) # combine to_local_time() with date_bin() query P @@ -3667,7 +3667,7 @@ SELECT arrow_cast(a, 'LargeUtf8') FROM (SELECT TIMESTAMP '2005-09-10 13:31:00' AS a) ---- -Timestamp(Nanosecond, None) 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 +Timestamp(ns) 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 2005-09-10T13:31:00 query TTTTT SELECT @@ -3678,4 +3678,4 @@ SELECT arrow_cast(a, 'LargeUtf8') FROM (SELECT 
CAST('2005-09-10 13:31:00 +02:00' AS timestamp with time zone) AS a) ---- -Timestamp(Nanosecond, Some("+00")) 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z +Timestamp(ns, "+00") 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z 2005-09-10T11:31:00Z diff --git a/datafusion/sqllogictest/test_files/type_coercion.slt b/datafusion/sqllogictest/test_files/type_coercion.slt index 3175a0646b79..e3baa8fedcf6 100644 --- a/datafusion/sqllogictest/test_files/type_coercion.slt +++ b/datafusion/sqllogictest/test_files/type_coercion.slt @@ -47,7 +47,7 @@ query error DataFusion error: Error during planning: Cannot coerce arithmetic ex select interval '1 month' - '2023-05-01'::date; # interval - timestamp -query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(Nanosecond, None\) to valid types +query error DataFusion error: Error during planning: Cannot coerce arithmetic expression Interval\(MonthDayNano\) \- Timestamp\(ns\) to valid types SELECT interval '1 month' - '2023-05-01 12:30:00'::timestamp; # dictionary(int32, utf8) -> utf8 diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 1f7605d220c5..75db459b1881 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -521,7 +521,7 @@ physical_plan 16)----ProjectionExec: expr=[1 as cnt] 17)------PlaceholderRowExec 18)----ProjectionExec: expr=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as cnt] -19)------BoundedWindowAggExec: wdw=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +19)------BoundedWindowAggExec: wdw=[lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "lead(b.c1,Int64(1)) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 20)--------ProjectionExec: expr=[1 as c1] 21)----------PlaceholderRowExec diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 38fcc1ba9016..50121813133b 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -863,11 +863,11 @@ select count(*) from (select unnest(range(0, 100000)) id) t inner join (select u # Test implicit LATERAL support for UNNEST # Issue: https://github.com/apache/datafusion/issues/13659 # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ data_type: Int64, nullable: true \}\), 
nullable: true \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) select * from unnest_table u, unnest(u.column1); # Test implicit LATERAL support for UNNEST (INNER JOIN) -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(Field \{ name: "column1", data_type: List\(Field \{ data_type: Int64, nullable: true \}\), nullable: true \}, Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) select * from unnest_table u INNER JOIN unnest(u.column1) AS t(column1) ON u.column3 = t.column1; # Test implicit LATERAL planning for UNNEST @@ -883,7 +883,7 @@ logical_plan 06)------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 07)--------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 08)----------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { data_type: Int64, nullable: true }), nullable: true }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) # Test implicit LATERAL planning for UNNEST (INNER JOIN) query TT @@ -899,7 +899,7 @@ logical_plan 07)--------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 08)----------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 09)------------EmptyRelation: rows=1 -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(Field { name: "column1", data_type: List(Field { data_type: Int64, nullable: true }), nullable: true }, Column { relation: Some(Bare { table: "u" }), name: "column1" }) # uncorrelated EXISTS with unnest query I @@ -969,7 +969,7 @@ physical_plan 08)--------------UnnestExec 09)----------------ProjectionExec: expr=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as generated_id, make_array(value@0) as __unnest_placeholder(make_array(range().value))] 10)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -11)--------------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING: Field { name: "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +11)--------------------BoundedWindowAggExec: wdw=[row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 12)----------------------LazyMemoryExec: partitions=1, batch_generators=[range: start=1, end=5, batch_size=8192] # Unnest array where data is already ordered by column2 (100, 200, 300, 400) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index f1a708d84dd3..d9b4a818f99e 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -360,7 +360,7 @@ physical_plan 02)--ProjectionExec: expr=[b@0 as b, max(d.a)@1 as max_a, max(d.seq)@2 as max(d.seq)] 03)----AggregateExec: mode=SinglePartitioned, gby=[b@2 as b], aggr=[max(d.a), max(d.seq)], ordering_mode=Sorted 04)------ProjectionExec: expr=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as seq, a@0 as a, b@1 as b] -05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() PARTITION BY [s.b] ORDER BY [s.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[b@1 ASC NULLS LAST, a@0 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=8192 08)--------------RepartitionExec: partitioning=Hash([b@1], 4), input_partitions=4 @@ -1241,9 +1241,9 @@ logical_plan 05)--------TableScan: aggregate_test_100 projection=[c8, c9] physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as sum2] -02)--BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { 
"sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c9@1 ASC NULLS LAST, c8@0 ASC NULLS LAST], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c8, c9], file_type=csv, has_header=true @@ -1262,9 +1262,9 @@ logical_plan 05)--------TableScan: aggregate_test_100 projection=[c2, c9] physical_plan 01)ProjectionExec: expr=[c2@0 as c2, max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] -03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c2@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c9], file_type=csv, has_header=true @@ -1286,10 +1286,10 @@ logical_plan physical_plan 01)SortExec: expr=[c2@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c2@0 as c2, max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING, min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: 
Following(UInt64(NULL)), is_causal: false }] -04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c9@1 ASC NULLS LAST, c2@0 ASC NULLS LAST], preserve_partitioning=[false] -06)----------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c2 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 07)------------SortExec: expr=[c2@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c9], file_type=csv, has_header=true @@ -1311,12 +1311,12 @@ logical_plan 05)--------TableScan: aggregate_test_100 projection=[c1, c2, c4] physical_plan 01)ProjectionExec: expr=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@2 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC 
NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING] -02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 03)----SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 04)------CoalesceBatchesExec: target_batch_size=4096 05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 06)----------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING] -07)------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 08)--------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 09)----------------CoalesceBatchesExec: target_batch_size=4096 10)------------------RepartitionExec: partitioning=Hash([c1@0, c2@1], 2), input_partitions=2 @@ -1343,8 +1343,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 
FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1386,8 +1386,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], 
mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1446,8 +1446,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as sum2] 02)--GlobalLimitExec: skip=5, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=15), expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1488,8 +1488,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as fv1, first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as fv2, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as lag1, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as lag2, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as lead1, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as lead2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: 
wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 
CURRENT ROW: Field { "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "first_value(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(aggregate_test_100.c9,Int64(2),Int64(10101)) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1531,9 +1531,9 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as rn1, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@1 as rn2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------SortExec: TopK(fetch=10), expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] -05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 06)----------SortExec: 
expr=[c9@0 DESC], preserve_partitioning=[false] 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -1573,10 +1573,10 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as rn2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------SortExec: TopK(fetch=10), expr=[c9@2 ASC NULLS LAST, c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false] -05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { 
"sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c1 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 07)------------SortExec: expr=[c9@2 DESC, c1@0 DESC], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c9], file_type=csv, has_header=true @@ -1655,19 +1655,19 @@ logical_plan physical_plan 01)ProjectionExec: expr=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@18 as a, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@18 as b, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@3 as c, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@11 as d, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@7 as e, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@3 as f, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@11 as g, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as h, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as i, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as j, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as k, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as l, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@17 as m, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@15 as n, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as o, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@16 as p, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@20 as a1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@20 as b1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@5 as c1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@13 as d1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@9 as e1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@5 as f1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@13 as g1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as h1, sum(null_cases.c1) ORDER BY 
[null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@19 as j1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as k1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as l1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as m1, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as n1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@12 as o1, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@21 as h11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@21 as j11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@6 as k11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@14 as l11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@10 as m11, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@6 as n11, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@14 as o11] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY 
[null_cases.c3 ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] 04)------ProjectionExec: expr=[c1@0 as c1, c3@2 as c3, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@4 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@6 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@7 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@8 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@9 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@10 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@11 as sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED 
FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING@12 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@13 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING@14 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING@15 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@16 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@17 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@18 as sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -05)--------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c3@2 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false] -07)------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 
ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 08)--------------SortExec: expr=[c3@2 ASC NULLS LAST, c1@0 ASC], preserve_partitioning=[false] -09)----------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS LAST, null_cases.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 10)------------------SortExec: expr=[c3@2 ASC NULLS LAST, c1@0 DESC], preserve_partitioning=[false] -11)--------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }] -12)----------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), 
end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] +11)--------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(10)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(11)), end_bound: Following(Int64(NULL)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 ASC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }] +12)----------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 
CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS LAST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] 13)------------------------SortExec: expr=[c3@2 DESC NULLS LAST], preserve_partitioning=[false] -14)--------------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] -15)----------------------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 
+14)--------------------------WindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(10)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: CurrentRow, is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 11 FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: Preceding(Int64(NULL)), end_bound: Following(Int64(11)), is_causal: false }, sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST] RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int64(NULL)), is_causal: false }] +15)----------------------------BoundedWindowAggExec: wdw=[sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(null_cases.c1) ORDER BY [null_cases.c3 DESC NULLS FIRST, null_cases.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 16)------------------------------SortExec: expr=[c3@2 DESC, c1@0 ASC NULLS LAST], preserve_partitioning=[false] 17)--------------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/null_cases.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true @@ -1741,8 +1741,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: 
Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true @@ -1785,8 +1785,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@1 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@2 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS 
BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------SortExec: TopK(fetch=10), expr=[c1@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c9], file_type=csv, has_header=true @@ -1831,9 +1831,9 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c3@1 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST, aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] 04)------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, c3@2 as c3, c9@3 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 
+05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortPreservingMergeExec: [__common_expr_1@0 DESC, c9@3 DESC, c2@1 ASC NULLS LAST] 07)------------SortExec: expr=[__common_expr_1@0 DESC, c9@3 DESC, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 08)--------------ProjectionExec: expr=[c3@1 + c4@2 as __common_expr_1, c2@0 as c2, c3@1 as c3, c9@3 as c9] @@ -1926,13 +1926,13 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c3@0 ASC NULLS LAST], fetch=5 02)--ProjectionExec: expr=[c3@0 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum2] -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c3] ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c3@0 ASC NULLS LAST, c9@1 DESC], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=4096 06)----------RepartitionExec: partitioning=Hash([c3@0], 2), input_partitions=2 07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 08)--------------ProjectionExec: expr=[c3@1 as c3, c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -09)----------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER 
BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 DESC NULLS FIRST, aggregate_test_100.c9 DESC NULLS FIRST, aggregate_test_100.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 10)------------------SortExec: expr=[c3@1 DESC, c9@2 DESC, c2@0 ASC NULLS LAST], preserve_partitioning=[false] 11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3, c9], file_type=csv, has_header=true @@ -1968,7 +1968,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 04)------SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=4096 06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -2097,7 +2097,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, rn1@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [aggregate_test_100.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 04)------SortExec: expr=[c1@0 ASC NULLS LAST], 
preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=4096 06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -2123,10 +2123,10 @@ logical_plan physical_plan 01)SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c1@0 as c1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING@2 as sum1, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@3 as sum2] -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------SortPreservingMergeExec: [c9@1 ASC NULLS LAST] 05)--------SortExec: expr=[c9@1 ASC NULLS LAST], preserve_partitioning=[true] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING], mode=[Sorted] 07)------------SortExec: expr=[c1@0 ASC NULLS LAST, c9@1 ASC NULLS LAST], preserve_partitioning=[true] 08)--------------CoalesceBatchesExec: target_batch_size=4096 09)----------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -2211,11 +2211,11 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c9@2 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sum1, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum2, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum3, 
sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum4] -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c9@3 as c9, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@6 as sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING] -05)--------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: 
"sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -07)------------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +05)--------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c2, aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +07)------------WindowAggExec: wdw=[sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100.c9) PARTITION BY [aggregate_test_100.c1, aggregate_test_100.c2] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST, aggregate_test_100.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] 08)--------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, c9@3 ASC NULLS LAST, c8@2 ASC NULLS LAST], preserve_partitioning=[false] 09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9], file_type=csv, has_header=true @@ -2266,12 +2266,12 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@1 as c9, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sum1, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum2, sum(t1.c9) PARTITION BY [t1.c1, 
t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum3, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum4] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 04)------ProjectionExec: expr=[c2@0 as c2, c9@2 as c9, c1_alias@3 as c1_alias, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@5 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@6 as sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING] -05)--------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +05)--------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c2, t1.c1_alias] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] 06)----------ProjectionExec: expr=[c2@1 as c2, c8@2 as c8, c9@3 as c9, c1_alias@4 as c1_alias, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING@5 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING, sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] 
ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@6 as sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING] -07)------------BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -08)--------------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] +07)------------BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +08)--------------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] 09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, c9@3 ASC NULLS LAST, c8@2 ASC NULLS LAST], preserve_partitioning=[false] 10)------------------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c8@2 as c8, c9@3 as c9, c1@0 as c1_alias] 11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9], file_type=csv, has_header=true @@ -2312,9 +2312,9 @@ physical_plan 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2] 02)--SortExec: TopK(fetch=5), expr=[c9@2 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING@3 as sum1, sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING@4 as sum2, c9@1 as c9] -04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING: Field { name: "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 
5 PRECEDING AND 3 PRECEDING: Field { "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST] GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING": nullable Float64 }, frame: GROUPS BETWEEN 5 PRECEDING AND 3 PRECEDING], mode=[Sorted] 05)--------ProjectionExec: expr=[c1@0 as c1, c9@2 as c9, c12@3 as c12, sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING@4 as sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING] -06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { name: "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING: Field { "sum(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST] GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING": nullable Float64 }, frame: GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 07)------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c9, c12], file_type=csv, has_header=true @@ -2348,7 +2348,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2385,7 +2385,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND 
CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2422,7 +2422,7 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[rn1@1 DESC], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2462,7 +2462,7 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[rn1@1 ASC NULLS LAST, c9@0 ASC NULLS LAST], preserve_partitioning=[false], sort_prefix=[rn1@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, 
has_header=true @@ -2537,7 +2537,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2559,7 +2559,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c5@0 as c5, c9@1 as c9, row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 + aggregate_test_100.c5 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[CAST(c9@1 AS Decimal128(20, 0)) + CAST(c5@0 AS Decimal128(20, 0)) DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5, c9], file_type=csv, has_header=true @@ -2580,7 +2580,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, CAST(row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 AS Int64) as rn1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE 
BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -2685,10 +2685,10 @@ physical_plan 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, sum3@2 as sum3, min1@3 as min1, min2@4 as min2, min3@5 as min3, max1@6 as max1, max2@7 as max2, max3@8 as max3, cnt1@9 as cnt1, cnt2@10 as cnt2, sumr1@11 as sumr1, sumr2@12 as sumr2, sumr3@13 as sumr3, minr1@14 as minr1, minr2@15 as minr2, minr3@16 as minr3, maxr1@17 as maxr1, maxr2@18 as maxr2, maxr3@19 as maxr3, cntr1@20 as cntr1, cntr2@21 as cntr2, sum4@22 as sum4, cnt3@23 as cnt3] 02)--SortExec: TopK(fetch=5), expr=[inc_col@24 DESC], preserve_partitioning=[false] 03)----ProjectionExec: expr=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as sum1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@14 as sum2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@15 as sum3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as min1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@17 as min2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as min3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as max1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@20 as max2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as max3, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@22 as cnt1, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@23 as cnt2, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@2 as sumr1, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@3 as sumr2, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@4 as sumr3, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as minr1, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@6 as minr2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@7 as minr3, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 
1 FOLLOWING@8 as maxr1, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@9 as maxr2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@10 as maxr3, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@11 as cntr1, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@12 as cntr2, sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@24 as sum4, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@25 as cnt3, inc_col@1 as inc_col] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 05)--------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, inc_col@3 as inc_col, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@5 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@6 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@7 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@8 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@9 as min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@10 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, 
max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@12 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@13 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@14 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@15 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@17 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@20 as min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@22 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@23 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as max(annotated_data_finite.inc_col) ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@25 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@26 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE 
BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 4 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} 
}, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { name: "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 2 PRECEDING AND 6 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { name: "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 8 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, 
min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING": Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts 
DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 2 PRECEDING AND 6 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 8 FOLLOWING], mode=[Sorted] 08)--------------ProjectionExec: expr=[CAST(desc_col@2 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Int64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col, desc_col@2 as desc_col] 09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col, desc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true @@ -2771,8 +2771,8 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[ts@0 DESC], preserve_partitioning=[false] 02)--ProjectionExec: expr=[ts@0 as ts, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, first_value(annotated_data_finite.inc_col) ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@25 as lead2, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr2, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2] -03)----BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER 
BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS 
BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY 
[annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { name: "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC 
NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "row_number() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": UInt64 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "dense_rank() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": UInt64 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 
FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lag(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY 
[annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "lead(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING], mode=[Sorted] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true query IIIIIIIIIIIIIIIIIIIIIIIII @@ -2843,8 +2843,8 @@ physical_plan 01)ProjectionExec: expr=[sum1@0 as sum1, sum2@1 as sum2, min1@2 as min1, min2@3 as min2, max1@4 as max1, max2@5 as max2, count1@6 as count1, count2@7 as count2, avg1@8 as avg1, avg2@9 as avg2] 02)--SortExec: TopK(fetch=5), expr=[inc_col@10 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@9 as sum1, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@10 as min1, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@5 as min2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@11 as max1, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@6 as max2, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@12 as count1, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@7 as count2, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@13 as avg1, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@8 as avg2, inc_col@3 as inc_col] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 
UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { name: "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING], mode=[Sorted] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE 
BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, 
avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] 06)----------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Float64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col] 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true @@ -2895,8 +2895,8 @@ physical_plan 01)ProjectionExec: expr=[first_value1@0 as first_value1, first_value2@1 as first_value2, last_value1@2 as last_value1, last_value2@3 as last_value2, nth_value1@4 as nth_value1] 02)--SortExec: TopK(fetch=5), expr=[inc_col@5 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@4 as first_value1, first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@2 as first_value2, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as last_value1, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as last_value2, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as nth_value1, inc_col@1 as inc_col] -04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] -05)--------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: 
"first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "nth_value(annotated_data_finite.inc_col,Int64(2)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "first_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "last_value(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true query IIIII @@ -2939,8 +2939,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as sum1, sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum2, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as count1, count(annotated_data_infinite.inc_col) ORDER BY 
[annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as count2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 
3 FOLLOWING], mode=[Sorted] 05)--------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, ts@0 as ts, inc_col@1 as inc_col] 06)----------StreamingTableExec: partition_sizes=1, projection=[ts, inc_col], infinite_source=true, output_ordering=[ts@0 ASC NULLS LAST] @@ -2984,8 +2984,8 @@ logical_plan physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@5 as sum1, sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@3 as sum2, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING@6 as count1, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as count2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING 
AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_infinite.inc_col) ORDER BY [annotated_data_infinite.ts DESC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] 05)--------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, ts@0 as ts, inc_col@1 as inc_col] 06)----------StreamingTableExec: partition_sizes=1, projection=[ts, inc_col], infinite_source=true, output_ordering=[ts@0 ASC NULLS LAST] @@ -3084,12 +3084,12 @@ logical_plan physical_plan 01)ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as sum2, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@15 as sum3, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 
CURRENT ROW@8 as sum10, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Linear] -04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[PartiallySorted([1, 0])] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY 
[annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[PartiallySorted([0])] -07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0, 1])] -08)--------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY 
[annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Linear] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[PartiallySorted([1, 0])] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 
PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[PartiallySorted([0])] +07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0, 1])] +08)--------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": 
nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 09)----------------ProjectionExec: expr=[CAST(c@2 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] 10)------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST] @@ -3152,17 +3152,17 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[c@2 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as sum2, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@15 as sum3, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as sum10, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, sum(annotated_data_finite2.c) 
PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12] -03)----BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING], mode=[Sorted] 04)------SortExec: expr=[d@4 ASC NULLS LAST, a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, 
annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING], mode=[Sorted] 06)----------SortExec: expr=[b@2 ASC NULLS LAST, a@1 ASC NULLS LAST, d@4 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { 
"sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 08)--------------SortExec: expr=[b@2 ASC NULLS LAST, a@1 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[Sorted] +09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING], mode=[Sorted] 10)------------------SortExec: expr=[a@1 ASC NULLS LAST, d@4 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -11)--------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: 
ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted] +11)--------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted] 12)----------------------SortExec: expr=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, d@4 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] -13)------------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { name: "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] +13)------------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY 
[annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 14)--------------------------ProjectionExec: expr=[CAST(c@2 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] 15)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true @@ -3226,7 +3226,7 @@ physical_plan 01)ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as rn1] 02)--CoalesceBatchesExec: target_batch_size=4096, fetch=5 03)----FilterExec: row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 < 50 -04)------BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST] # Top level sort is pushed down through BoundedWindowAggExec as its SUM result does already satisfy the required @@ -3248,7 +3248,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c9@0 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as sum1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c9@0 DESC], preserve_partitioning=[false] 
05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -3333,11 +3333,11 @@ logical_plan 08)--------------TableScan: annotated_data_infinite2 projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum3, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum4] -02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] +02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] 03)----ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, a@1 as a, d@4 as d, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 
-04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] -05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], 
mode=[Sorted] 07)------------ProjectionExec: expr=[CAST(a@0 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] 08)--------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST] @@ -3364,17 +3364,17 @@ logical_plan 08)--------------TableScan: annotated_data_infinite2 projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum3, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum4] -02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] +02)--BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Linear] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------RepartitionExec: partitioning=Hash([d@2], 2), input_partitions=2, preserve_order=true, sort_exprs=__common_expr_1@0 ASC NULLS LAST, a@1 ASC NULLS LAST 05)--------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, a@1 as a, d@4 as d, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(annotated_data_infinite2.a) PARTITION BY 
[annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 07)------------CoalesceBatchesExec: target_batch_size=4096 08)--------------RepartitionExec: partitioning=Hash([b@2, a@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST -09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] +09)----------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[PartiallySorted([0])] 10)------------------CoalesceBatchesExec: target_batch_size=4096 11)--------------------RepartitionExec: partitioning=Hash([a@1, d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST -12)----------------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +12)----------------------BoundedWindowAggExec: wdw=[sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 13)------------------------CoalesceBatchesExec: target_batch_size=4096 14)--------------------------RepartitionExec: partitioning=Hash([a@1, b@2], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST, __common_expr_1@0 ASC NULLS LAST 15)----------------------------ProjectionExec: expr=[CAST(a@0 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] @@ -3433,10 +3433,10 @@ logical_plan physical_plan 01)SortExec: TopK(fetch=5), expr=[c3@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[c3@0 as c3, max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as max1] -03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c12@1 ASC NULLS LAST], preserve_partitioning=[false] 05)--------ProjectionExec: expr=[c3@0 as c3, c12@2 as c12, min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] -06)----------WindowAggExec: wdw=[min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: 
Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +06)----------WindowAggExec: wdw=[min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 07)------------SortExec: expr=[c11@1 ASC NULLS LAST], preserve_partitioning=[false] 08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c11, c12], file_type=csv, has_header=true @@ -3477,7 +3477,7 @@ physical_plan 01)ProjectionExec: expr=[min1@0 as min1, max1@1 as max1] 02)--SortExec: TopK(fetch=5), expr=[c3@2 ASC NULLS LAST], preserve_partitioning=[false] 03)----ProjectionExec: expr=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as min1, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as max1, c3@0 as c3] -04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c12@1 ASC NULLS LAST], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c12], file_type=csv, has_header=true @@ -3529,7 +3529,7 @@ logical_plan 02)--Filter: multiple_ordered_table.b = Int32(0) 03)----TableScan: multiple_ordered_table projection=[a0, a, b, c, d], partial_filters=[multiple_ordered_table.b = Int32(0)] physical_plan -01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 
CURRENT ROW: Field { name: "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 02)--CoalesceBatchesExec: target_batch_size=4096 03)----FilterExec: b@2 = 0 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]], file_type=csv, has_header=true @@ -3547,7 +3547,7 @@ logical_plan 02)--Filter: multiple_ordered_table.b = Int32(0) 03)----TableScan: multiple_ordered_table projection=[a0, a, b, c, d], partial_filters=[multiple_ordered_table.b = Int32(0)] physical_plan -01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +01)BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.a) ORDER BY [multiple_ordered_table.b ASC NULLS LAST, multiple_ordered_table.d ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 02)--SortExec: expr=[d@4 ASC NULLS LAST], preserve_partitioning=[false] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------FilterExec: b@2 = 0 @@ -3584,9 +3584,9 @@ logical_plan 05)--------TableScan: multiple_ordered_table projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as max1] -02)--BoundedWindowAggExec: wdw=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW: Field { "min(multiple_ordered_table.d) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----ProjectionExec: expr=[c@2 as c, d@3 as d, max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -04)------BoundedWindowAggExec: wdw=[max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.b, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_orderings=[[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], [c@2 ASC NULLS LAST]], file_type=csv, has_header=true query TT @@ -3603,7 +3603,7 @@ logical_plan 04)------TableScan: multiple_ordered_table projection=[c, d], partial_filters=[multiple_ordered_table.d = Int32(0)] physical_plan 01)ProjectionExec: expr=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as max_c] -02)--BoundedWindowAggExec: wdw=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(multiple_ordered_table.c) PARTITION BY [multiple_ordered_table.d] ORDER BY [multiple_ordered_table.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----CoalesceBatchesExec: 
target_batch_size=4096 04)------FilterExec: d@1 = 0 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c, d], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true @@ -3618,7 +3618,7 @@ logical_plan 03)----TableScan: multiple_ordered_table projection=[a, c, d] physical_plan 01)ProjectionExec: expr=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c] ORDER BY [multiple_ordered_table.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c, d], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], file_type=csv, has_header=true query TT @@ -3631,7 +3631,7 @@ logical_plan 03)----TableScan: multiple_ordered_table projection=[a, b, c, d] physical_plan 01)ProjectionExec: expr=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(multiple_ordered_table.d) PARTITION BY [multiple_ordered_table.c, multiple_ordered_table.a] ORDER BY [multiple_ordered_table.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED 
PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_orderings=[[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], [c@2 ASC NULLS LAST]], file_type=csv, has_header=true query I @@ -3673,7 +3673,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c@0 as c, nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nv1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int32(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "nth_value(multiple_ordered_table.c,Int64(2)) ORDER BY [multiple_ordered_table.c DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int32(NULL)), is_causal: false }] 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], file_type=csv, has_header=true query II @@ -3724,7 +3724,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c@3 ASC NULLS LAST] 02)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW@5 as avg_d] -03)----BoundedWindowAggExec: wdw=[avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW: Field { name: "avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN 10 PRECEDING AND CURRENT ROW], mode=[Linear] +03)----BoundedWindowAggExec: wdw=[avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW: Field { "avg(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW": nullable Float64 }, frame: RANGE BETWEEN 10 PRECEDING AND CURRENT ROW], mode=[Linear] 04)------CoalesceBatchesExec: target_batch_size=4096 05)--------RepartitionExec: partitioning=Hash([d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST 
06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -4059,7 +4059,7 @@ logical_plan 03)----TableScan: table_with_pk projection=[sn, ts, currency, amount] physical_plan 01)ProjectionExec: expr=[sn@0 as sn, ts@1 as ts, currency@2 as currency, amount@3 as amount, sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum1] -02)--BoundedWindowAggExec: wdw=[sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(table_with_pk.amount) ORDER BY [table_with_pk.sn ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----SortExec: expr=[sn@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -4178,9 +4178,9 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c3@0 as c3, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum2, sum1@3 as sum1] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(Int16(NULL)), is_causal: false }] 04)------ProjectionExec: expr=[c3@0 as c3, c4@1 as c4, c9@2 as c9, sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum1] -05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[sum(aggregate_test_100.c9) ORDER BY 
[aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(aggregate_test_100.c9) ORDER BY [aggregate_test_100.c3 + aggregate_test_100.c4 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c3@0 + c4@1 DESC], preserve_partitioning=[false] 07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c4, c9], file_type=csv, has_header=true @@ -4219,7 +4219,7 @@ logical_plan 04)------TableScan: a projection=[a] physical_plan 01)ProjectionExec: expr=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as count(*) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] -02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "count(Int64(1)) PARTITION BY [a.a] ORDER BY [a.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -4242,7 +4242,7 @@ logical_plan 04)------TableScan: a projection=[a] physical_plan 01)ProjectionExec: expr=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] -02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { name: "row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] +02)--BoundedWindowAggExec: wdw=[row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Field { "row_number() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING], mode=[Sorted] 03)----CoalesceBatchesExec: target_batch_size=4096 04)------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -5311,7 +5311,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank@2 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] -03)----BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER 
BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=1 06)----------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5355,7 +5355,7 @@ physical_plan 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] 03)----CoalesceBatchesExec: target_batch_size=1 04)------FilterExec: c2@1 >= 10 -05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=1 08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5397,7 +5397,7 @@ physical_plan 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] 03)----CoalesceBatchesExec: target_batch_size=1 04)------FilterExec: c2@1 = 10 -05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c2@1 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=1 08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), 
input_partitions=2 @@ -5438,7 +5438,7 @@ physical_plan 02)--ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank] 03)----CoalesceBatchesExec: target_batch_size=1 04)------FilterExec: c1@0 = 1 OR c2@1 = 10 -05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +05)--------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 06)----------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 07)------------CoalesceBatchesExec: target_batch_size=1 08)--------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5481,11 +5481,11 @@ physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST] 02)--SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST], preserve_partitioning=[true] 03)----ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank1, rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rank2] -04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c2@1 ASC NULLS LAST, c1@0 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=1 07)------------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2 -08)--------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +08)--------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS 
LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 10)------------------CoalesceBatchesExec: target_batch_size=1 11)--------------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5532,13 +5532,13 @@ physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST] 02)--SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, rank1@2 ASC NULLS LAST, rank2@3 ASC NULLS LAST], preserve_partitioning=[true] 03)----ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rank1, rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as rank2] -04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +04)------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c2, t1.c1] ORDER BY [t1.c1 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 05)--------SortExec: expr=[c2@1 ASC NULLS LAST, c1@0 ASC NULLS LAST], preserve_partitioning=[true] 06)----------CoalesceBatchesExec: target_batch_size=1 07)------------RepartitionExec: partitioning=Hash([c2@1, c1@0], 2), input_partitions=2 08)--------------CoalesceBatchesExec: target_batch_size=1 09)----------------FilterExec: c2@1 > 1 -10)------------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +10)------------------BoundedWindowAggExec: wdw=[rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() PARTITION BY [t1.c1] ORDER BY [t1.c2 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 11)--------------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST], preserve_partitioning=[true] 12)----------------------CoalesceBatchesExec: target_batch_size=1 13)------------------------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2 @@ -5599,7 +5599,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, sum_c9@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, sum(aggregate_test_100_ordered.c9) 
PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as sum_c9] -03)----WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +03)----WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 04)------CoalesceBatchesExec: target_batch_size=1 05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2, preserve_order=true, sort_exprs=c1@0 ASC NULLS LAST 06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -5615,7 +5615,7 @@ logical_plan 04)------TableScan: aggregate_test_100_ordered projection=[c9] physical_plan 01)ProjectionExec: expr=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as sum_c9] -02)--WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(aggregate_test_100_ordered.c9) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c9], file_type=csv, has_header=true @@ -5630,7 +5630,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST, min_c5@1 DESC NULLS LAST] 02)--ProjectionExec: expr=[c1@0 as c1, min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as min_c5] -03)----WindowAggExec: wdw=[min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +03)----WindowAggExec: 
wdw=[min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "min(aggregate_test_100_ordered.c5) PARTITION BY [aggregate_test_100_ordered.c1] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 04)------CoalesceBatchesExec: target_batch_size=1 05)--------RepartitionExec: partitioning=Hash([c1@0], 2), input_partitions=2, preserve_order=true, sort_exprs=c1@0 ASC NULLS LAST 06)----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 @@ -5646,7 +5646,7 @@ logical_plan 04)------TableScan: aggregate_test_100_ordered projection=[c5] physical_plan 01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5] -02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] +02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true query II rowsort @@ -5829,7 +5829,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [k@0 ASC NULLS LAST, time@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[k@0 as k, time@2 as time, count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@3 as normal_count, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@4 as distinct_count] -03)----BoundedWindowAggExec: wdw=[count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS 
LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "count(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "count(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[k@0 ASC NULLS LAST, time@2 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=1 06)----------RepartitionExec: partitioning=Hash([k@0], 2), input_partitions=2 @@ -5892,7 +5892,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [k@0 ASC NULLS LAST, time@1 ASC NULLS LAST] 02)--ProjectionExec: expr=[k@1 as k, time@2 as time, sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@3 as sum_v, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW@4 as sum_distinct_v] -03)----BoundedWindowAggExec: wdw=[sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { name: "sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY 
[table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "sum(table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW, sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW: Field { "sum(DISTINCT table_test_distinct_count.v) PARTITION BY [table_test_distinct_count.k] ORDER BY [table_test_distinct_count.time ASC NULLS LAST] RANGE BETWEEN 2 minutes PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 120000000000 } PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[k@1 ASC NULLS LAST, time@2 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=1 06)----------RepartitionExec: partitioning=Hash([k@1], 2), input_partitions=2 @@ -5937,7 +5937,7 @@ LIMIT 5 ---- DataFusion error: type_coercion caused by -Error during planning: Cannot infer common argument type for comparison operation Int64 >= List(Field { name: "item", data_type: Null, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +Error during planning: Cannot infer common argument type for comparison operation Int64 >= List(nullable Null) @@ -5965,7 +5965,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[c1@2 as c1, c2@3 as c2, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum1, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum2, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as count1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@7 as array_agg1, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@8 as array_agg2] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS 
LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS 
LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(nullable Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(nullable Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortPreservingMergeExec: [c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], fetch=5 05)--------SortExec: TopK(fetch=5), expr=[c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], preserve_partitioning=[true] 06)----------ProjectionExec: expr=[__common_expr_3@0 as __common_expr_1, __common_expr_3@0 AND c2@2 < 4 AND c1@1 > 0 as __common_expr_2, c1@1 as c1, c2@2 as c2] diff --git a/datafusion/sqllogictest/test_files/window_limits.slt b/datafusion/sqllogictest/test_files/window_limits.slt index c1e680084f4b..883cd4404f4f 100644 --- a/datafusion/sqllogictest/test_files/window_limits.slt +++ b/datafusion/sqllogictest/test_files/window_limits.slt @@ -71,7 +71,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.empno) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=4), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -108,7 +108,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 
+03)----BoundedWindowAggExec: wdw=[lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -170,7 +170,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as lead1, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as lead3, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as lead5] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=10), 
expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true @@ -207,7 +207,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.empno,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=3), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -244,7 +244,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.empno,Int64(5)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int32 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=3), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -309,7 +309,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as running_sum, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW@4 as running_avg, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as running_min, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as running_max] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "avg(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Float64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "min(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "max(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@1 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[depname, empno, salary], file_type=csv, has_header=true @@ 
-371,7 +371,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as rnk, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as drnk] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "row_number() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "dense_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -433,7 +433,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as pr, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as cd, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as nt] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----WindowAggExec: wdw=[percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }] +03)----WindowAggExec: wdw=[percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "percent_rank() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "cume_dist() ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "ntile(Int64(4)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64 }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }] 04)------SortExec: expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno], file_type=csv, has_header=true @@ -498,7 +498,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as fv, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as l1, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as lv, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as n3] 02)--GlobalLimitExec: skip=0, fetch=5 -03)----BoundedWindowAggExec: wdw=[first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lag(employees.salary,Int64(1)) ORDER BY 
[employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "first_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lag(employees.salary,Int64(1)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "last_value(employees.salary) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "nth_value(employees.salary,Int64(3)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true @@ -541,7 +541,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [depname@0 ASC NULLS LAST], fetch=5 02)--ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, salary@2 as salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW@3 as running_sum] -03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { name: "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW", data_type: 
UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=4 @@ -587,7 +587,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [depname@0 ASC NULLS LAST], fetch=5 02)--ProjectionExec: expr=[depname@0 as depname, empno@1 as empno, salary@2 as salary, sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW@3 as running_sum] -03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { name: "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW: Field { "sum(employees.salary) PARTITION BY [employees.depname] ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[depname@0 ASC NULLS LAST, empno@1 ASC NULLS LAST], preserve_partitioning=[true] 05)--------CoalesceBatchesExec: target_batch_size=8192 06)----------RepartitionExec: partitioning=Hash([depname@0], 4), input_partitions=4 @@ -764,6 +764,6 @@ logical_plan physical_plan 01)ProjectionExec: expr=[empno@0 as empno, salary@1 as salary, lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as lead2] 02)--GlobalLimitExec: skip=0, fetch=3 -03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { name: "lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] +03)----BoundedWindowAggExec: wdw=[lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "lead(employees.salary,Int64(2)) ORDER BY [employees.empno ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable UInt64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortExec: TopK(fetch=5), expr=[empno@0 ASC NULLS LAST], preserve_partitioning=[false] 05)--------DataSourceExec: file_groups={1 
group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100_with_dates.csv]]}, projection=[empno, salary], file_type=csv, has_header=true diff --git a/datafusion/substrait/Cargo.toml b/datafusion/substrait/Cargo.toml index 605dfc15be3f..8417bd56852f 100644 --- a/datafusion/substrait/Cargo.toml +++ b/datafusion/substrait/Cargo.toml @@ -40,7 +40,7 @@ itertools = { workspace = true } object_store = { workspace = true } pbjson-types = { workspace = true } prost = { workspace = true } -substrait = { version = "0.58", features = ["serde"] } +substrait = { version = "0.59", features = ["serde"] } url = { workspace = true } tokio = { workspace = true, features = ["fs"] } uuid = { version = "1.17.0", features = ["v4"] } diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 6b9cb0843c53..4174fef7a692 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -25,6 +25,15 @@ You can see the current [status of the `51.0.0`release here](https://github.com/apache/datafusion/issues/17558) +### `arrow` / `parquet` updated to 57.0.0 + +### Upgrade to arrow `57.0.0` and parquet `57.0.0` + +This version of DataFusion upgrades the underlying Apache Arrow implementation +to version `57.0.0`, including several dependent crates such as `prost`, +`tonic`, `pyo3`, and `substrait`. . See the [release +notes](https://github.com/apache/arrow-rs/releases/tag/57.0.0) for more details. + ### `MSRV` updated to 1.87.0 The Minimum Supported Rust Version (MSRV) has been updated to [`1.87.0`]. diff --git a/docs/source/user-guide/sql/data_types.md b/docs/source/user-guide/sql/data_types.md index d977a4396e40..02edb6371ce3 100644 --- a/docs/source/user-guide/sql/data_types.md +++ b/docs/source/user-guide/sql/data_types.md @@ -41,7 +41,18 @@ You can cast a SQL expression to a specific Arrow type using the `arrow_cast` fu For example, to cast the output of `now()` to a `Timestamp` with second precision: ```sql -select arrow_cast(now(), 'Timestamp(Second, None)'); +select arrow_cast(now(), 'Timestamp(s)') as "now()"; ++---------------------+ +| now() | ++---------------------+ +| 2025-10-24T20:02:45 | ++---------------------+ +``` + +The older syntax still works as well: + +```sql +select arrow_cast(now(), 'Timestamp(Second, None)') as "now()"; +---------------------+ | now() | +---------------------+ diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 30e10a84fd8e..f6a49c2f1763 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -5003,16 +5003,26 @@ arrow_cast(expression, datatype) #### Example ```sql -> select arrow_cast(-5, 'Int8') as a, +> select + arrow_cast(-5, 'Int8') as a, arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, - arrow_cast('bar', 'LargeUtf8') as c, - arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d - ; -+----+-----+-----+---------------------------+ -| a | b | c | d | -+----+-----+-----+---------------------------+ -| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | -+----+-----+-----+---------------------------+ + arrow_cast('bar', 'LargeUtf8') as c; + ++----+-----+-----+ +| a | b | c | ++----+-----+-----+ +| -5 | foo | bar | ++----+-----+-----+ + +> select + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs, "+08:00")') as d, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs)') as e; + ++---------------------------+---------------------+ +| d | e | 
++---------------------------+---------------------+ +| 2023-01-02T12:53:02+08:00 | 2023-01-02T12:53:02 | ++---------------------------+---------------------+ ``` ### `arrow_typeof` From 561e00b19dac4a53e1714a534f40b147e565c6cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 09:42:07 -0700 Subject: [PATCH 021/157] chore(deps): bump syn from 2.0.106 to 2.0.108 (#18291) Bumps [syn](https://github.com/dtolnay/syn) from 2.0.106 to 2.0.108.
Release notes

Sourced from syn's releases.

2.0.108

  • Parse unrecognized or invalid literals as Lit::Verbatim (#1925)

2.0.107

  • Improve panic message when constructing a LitInt, LitFloat, or Lit from invalid syntax (#1917)
  • Improve panic message on Punctuated index out of bounds (#1922)
Commits
  • 7a7e331 Release 2.0.108
  • 30463af Merge pull request #1926 from dtolnay/litfuzz
  • 1cc9167 Add fuzzer for literal parsing
  • c49e1d3 Merge pull request #1925 from dtolnay/litparse
  • d047536 Report unexpected verbatim literals in test
  • ce97767 Parse unrecognized or invalid literals as Lit::Verbatim
  • e4a8957 Release 2.0.107
  • 1792e83 Merge pull request #1922 from dtolnay/outofbounds
  • 532e4af Improve panic message on Punctuated index out of bounds
  • 909c222 Add test of Punctuated indexing
  • Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 110 +++++++++++++++++------------------ datafusion/macros/Cargo.toml | 2 +- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 55c334e157db..aaa75ecf3247 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -536,7 +536,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -547,7 +547,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1017,7 +1017,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1147,7 +1147,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1170,7 +1170,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1425,7 +1425,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1740,7 +1740,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -1751,7 +1751,7 @@ checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ "darling_core", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -2374,7 +2374,7 @@ version = "50.3.0" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -2767,7 +2767,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -2823,7 +2823,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -2861,7 +2861,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -3122,7 +3122,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -3833,7 +3833,7 @@ checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4427,7 +4427,7 @@ dependencies = [ "regex", "regex-syntax", "structmeta", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4555,7 +4555,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4628,7 +4628,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4703,7 +4703,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4774,7 +4774,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.106", + "syn 2.0.108", "tempfile", ] @@ -4788,7 +4788,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4883,7 +4883,7 @@ dependencies = [ 
"proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -4896,7 +4896,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5107,7 +5107,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5156,7 +5156,7 @@ checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5339,7 +5339,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.106", + "syn 2.0.108", "unicode-ident", ] @@ -5351,7 +5351,7 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand 0.8.5", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5554,7 +5554,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5645,7 +5645,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5656,7 +5656,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5680,7 +5680,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5692,7 +5692,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5736,7 +5736,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5912,7 +5912,7 @@ checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5960,7 +5960,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -5971,7 +5971,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6020,7 +6020,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6032,7 +6032,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6066,7 +6066,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.106", + "syn 2.0.108", "typify", "walkdir", ] @@ -6090,9 +6090,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.106" +version = "2.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917" dependencies = [ "proc-macro2", "quote", @@ -6116,7 +6116,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6233,7 +6233,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] 
[[package]] @@ -6356,7 +6356,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6572,7 +6572,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -6680,7 +6680,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.106", + "syn 2.0.108", "thiserror", "unicode-ident", ] @@ -6698,7 +6698,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.106", + "syn 2.0.108", "typify-impl", ] @@ -6894,7 +6894,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "wasm-bindgen-shared", ] @@ -6929,7 +6929,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6964,7 +6964,7 @@ checksum = "b673bca3298fe582aeef8352330ecbad91849f85090805582400850f8270a2e8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7109,7 +7109,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7120,7 +7120,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7436,7 +7436,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "synstructure", ] @@ -7457,7 +7457,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] @@ -7477,7 +7477,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", "synstructure", ] @@ -7517,7 +7517,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.108", ] [[package]] diff --git a/datafusion/macros/Cargo.toml b/datafusion/macros/Cargo.toml index fe979720bc56..64781ddeaf42 100644 --- a/datafusion/macros/Cargo.toml +++ b/datafusion/macros/Cargo.toml @@ -43,4 +43,4 @@ proc-macro = true [dependencies] datafusion-doc = { workspace = true } quote = "1.0.41" -syn = { version = "2.0.106", features = ["full"] } +syn = { version = "2.0.108", features = ["full"] } From 1cb226e78ea4bdf284917fc1bfced4d6903e3326 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 27 Oct 2025 14:47:36 -0400 Subject: [PATCH 022/157] Let `FileScanConfig` own a list of `ProjectionExpr`s (#18253) ## Which issue does this PR close? - Related to https://github.com/apache/datafusion/issues/14993 ## Rationale for this change To enable expression pushdown to file sources, we need to plumb expressions through the `FileScanConfig` layer. Currently, `FileScanConfig` only tracks column indices for projection, which limits us to simple and naive column selection. This PR begins expression pushdown implementation by having `FileScanConfig` own a list of `ProjectionExpr`s, instead of column indices. 
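For existing callers the visible change is limited to the builder API: the old `with_projection` setter becomes `with_projection_indices`, while the indices passed in are unchanged. The fragment below is a minimal sketch of that call-site change, modeled on the `csv_json_opener` example touched in this diff; imports are elided and `object_store_url`, `schema`, and `path` are assumed to be set up as in that example, so treat it as illustrative rather than a complete program.

```rust
// Minimal sketch of the renamed projection setter, mirroring the
// csv_json_opener example updated in this patch (imports elided;
// `object_store_url`, `schema`, and `path` assumed in scope).
let conf = FileScanConfigBuilder::new(
    object_store_url,
    Arc::clone(&schema),
    Arc::new(CsvSource::default()),
)
// Previously: .with_projection(Some(vec![12, 0]))
.with_projection_indices(Some(vec![12, 0])) // still plain column indices at the API surface
.with_limit(Some(5))
.with_file(PartitionedFile::new(path.display().to_string(), 10))
.build();
```
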
This allows file sources to eventually receive and optimize based on the actual expressions being projected. ## Notes about this PR - The first commit is based off of https://github.com/apache/datafusion/pull/18231 - To avoid a super large diff and a harder review, I've decided to break (#14993) into 2 tasks: - Have the `DataSource` (`FileScanConfig`) actually hold projection expressions (this PR) - Flow the projection expressions from `DataSourceExec` all the way to the `FileSource` --------- Co-authored-by: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> --- .../examples/advanced_parquet_index.rs | 2 +- .../examples/csv_json_opener.rs | 4 +- .../examples/default_column_values.rs | 2 +- datafusion-examples/examples/parquet_index.rs | 2 +- datafusion/catalog-listing/src/table.rs | 2 +- .../core/src/datasource/file_format/mod.rs | 2 +- .../core/src/datasource/physical_plan/avro.rs | 6 +- .../core/src/datasource/physical_plan/csv.rs | 6 +- .../core/src/datasource/physical_plan/json.rs | 4 +- .../src/datasource/physical_plan/parquet.rs | 4 +- .../core/tests/parquet/schema_coercion.rs | 2 +- .../filter_pushdown/util.rs | 2 +- .../physical_optimizer/projection_pushdown.rs | 6 +- datafusion/datasource/src/file_scan_config.rs | 135 +++++++++---- datafusion/datasource/src/table_schema.rs | 4 + datafusion/physical-expr/src/projection.rs | 186 ++++++++++++++---- datafusion/physical-plan/src/projection.rs | 8 +- .../proto/src/physical_plan/from_proto.rs | 2 +- .../proto/src/physical_plan/to_proto.rs | 5 +- .../tests/cases/roundtrip_physical_plan.rs | 4 +- .../substrait/src/physical_plan/consumer.rs | 4 +- .../substrait/src/physical_plan/producer.rs | 7 +- docs/source/library-user-guide/upgrading.md | 51 +++++ 23 files changed, 334 insertions(+), 116 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 55400e219283..1c560be6d08a 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -502,7 +502,7 @@ impl TableProvider for IndexTableProvider { let file_scan_config = FileScanConfigBuilder::new(object_store_url, schema, file_source) .with_limit(limit) - .with_projection(projection.cloned()) + .with_projection_indices(projection.cloned()) .with_file(partitioned_file) .build(); diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index 1a2c2cbff418..8abed90238d4 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -60,7 +60,7 @@ async fn csv_opener() -> Result<()> { Arc::clone(&schema), Arc::new(CsvSource::default()), ) - .with_projection(Some(vec![12, 0])) + .with_projection_indices(Some(vec![12, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.display().to_string(), 10)) .build(); @@ -126,7 +126,7 @@ async fn json_opener() -> Result<()> { schema, Arc::new(JsonSource::default()), ) - .with_projection(Some(vec![1, 0])) + .with_projection_indices(Some(vec![1, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.to_string(), 10)) .build(); diff --git a/datafusion-examples/examples/default_column_values.rs b/datafusion-examples/examples/default_column_values.rs index 43e2d4ca0988..d3a7d2ec67f3 100644 --- a/datafusion-examples/examples/default_column_values.rs +++ b/datafusion-examples/examples/default_column_values.rs @@ -260,7 +260,7 @@ impl TableProvider for 
DefaultValueTableProvider { self.schema.clone(), Arc::new(parquet_source), ) - .with_projection(projection.cloned()) + .with_projection_indices(projection.cloned()) .with_limit(limit) .with_file_group(file_group) .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _)); diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index afc3b279f4a9..127c55da982c 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -246,7 +246,7 @@ impl TableProvider for IndexTableProvider { let source = Arc::new(ParquetSource::default().with_predicate(predicate)); let mut file_scan_config_builder = FileScanConfigBuilder::new(object_store_url, self.schema(), source) - .with_projection(projection.cloned()) + .with_projection_indices(projection.cloned()) .with_limit(limit); // Transform to the format needed to pass to DataSourceExec diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index e9ac1bf097a2..95f9523d4401 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -499,7 +499,7 @@ impl TableProvider for ListingTable { .with_file_groups(partitioned_file_lists) .with_constraints(self.constraints.clone()) .with_statistics(statistics) - .with_projection(projection) + .with_projection_indices(projection) .with_limit(limit) .with_output_ordering(output_ordering) .with_table_partition_cols(table_partition_cols) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index e165707c2eb0..4881783eeba6 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -90,7 +90,7 @@ pub(crate) mod test_util { ) .with_file_groups(file_groups) .with_statistics(statistics) - .with_projection(projection) + .with_projection_indices(projection) .with_limit(limit) .build(), ) diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 8a00af959ccc..9068c9758179 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -88,7 +88,7 @@ mod tests { source, ) .with_file(meta.into()) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -160,7 +160,7 @@ mod tests { let source = Arc::new(AvroSource::new()); let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file(meta.into()) - .with_projection(projection) + .with_projection_indices(projection) .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -231,7 +231,7 @@ mod tests { let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. 
- .with_projection(projection) + .with_projection_indices(projection) .with_file(partitioned_file) .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) .build(); diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index b2ef51a76f89..4f46a57d8b13 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -118,7 +118,7 @@ mod tests { )) .with_file_compression_type(file_compression_type) .with_newlines_in_values(false) - .with_projection(Some(vec![0, 2, 4])) + .with_projection_indices(Some(vec![0, 2, 4])) .build(); assert_eq!(13, config.file_schema().fields().len()); @@ -183,7 +183,7 @@ mod tests { )) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) - .with_projection(Some(vec![4, 0, 2])) + .with_projection_indices(Some(vec![4, 0, 2])) .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); @@ -373,7 +373,7 @@ mod tests { .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) // We should be able to project on the partition column // Which is supposed to be after the file fields - .with_projection(Some(vec![0, num_file_schema_fields])) + .with_projection_indices(Some(vec![0, num_file_schema_fields])) .build(); // we don't have `/date=xx/` in the path but that is ok because diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 0d45711c76fb..f7d5c710bf48 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -297,7 +297,7 @@ mod tests { let source = Arc::new(JsonSource::new()); let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) - .with_projection(Some(vec![0, 2])) + .with_projection_indices(Some(vec![0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); @@ -345,7 +345,7 @@ mod tests { let source = Arc::new(JsonSource::new()); let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) - .with_projection(Some(vec![3, 0, 2])) + .with_projection_indices(Some(vec![3, 0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 10a475c1cc9a..6df5cd7ac68f 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -201,7 +201,7 @@ mod tests { source, ) .with_file_group(file_group) - .with_projection(self.projection.clone()) + .with_projection_indices(self.projection.clone()) .build(); DataSourceExec::from_data_source(base_config) } @@ -1655,7 +1655,7 @@ mod tests { let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) .with_file(partitioned_file) // file has 10 cols so index 12 should be month and 13 should be day - .with_projection(Some(vec![0, 1, 2, 12, 13])) + .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) .with_table_partition_cols(vec![ Field::new("year", DataType::Utf8, false), Field::new("month", DataType::UInt8, false), diff --git a/datafusion/core/tests/parquet/schema_coercion.rs 
b/datafusion/core/tests/parquet/schema_coercion.rs index 59cbf4b0872e..9be391a9108e 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -126,7 +126,7 @@ async fn multi_parquet_coercion_projection() { Arc::new(ParquetSource::default()), ) .with_file_group(file_group) - .with_projection(Some(vec![1, 0, 2])) + .with_projection_indices(Some(vec![1, 0, 2])) .build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index f05f3f00281d..54e8e7bf04da 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -165,7 +165,7 @@ impl FileSource for TestSource { fn with_projection(&self, config: &FileScanConfig) -> Arc { Arc::new(TestSource { - projection: config.projection.clone(), + projection: config.projection_exprs.as_ref().map(|p| p.column_indices()), ..self.clone() }) } diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index c51a5e02c9c3..8631613c3925 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -390,7 +390,7 @@ fn create_simple_csv_exec() -> Arc { Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![0, 1, 2, 3, 4])) + .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) .build(); DataSourceExec::from_data_source(config) @@ -409,7 +409,7 @@ fn create_projecting_csv_exec() -> Arc { Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_projection(Some(vec![3, 2, 1])) + .with_projection_indices(Some(vec![3, 2, 1])) .build(); DataSourceExec::from_data_source(config) @@ -1596,7 +1596,7 @@ fn partitioned_data_source() -> Arc { ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)]) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); DataSourceExec::from_data_source(config) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 695252803bae..c52397d9a7cc 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -44,18 +44,20 @@ use datafusion_execution::{ object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext, }; use datafusion_expr::Operator; -use datafusion_physical_expr::expressions::BinaryExpr; -use datafusion_physical_expr::{expressions::Column, utils::reassign_expr_columns}; +use datafusion_physical_expr::expressions::{BinaryExpr, Column}; +use datafusion_physical_expr::projection::ProjectionExprs; +use datafusion_physical_expr::utils::reassign_expr_columns; use datafusion_physical_expr::{split_conjunction, EquivalenceProperties, Partitioning}; use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::projection::ProjectionExpr; +use datafusion_physical_plan::projection::{ + all_alias_free_columns, new_projections_for_columns, ProjectionExpr, +}; use 
datafusion_physical_plan::{ display::{display_orderings, ProjectSchemaDisplay}, filter_pushdown::FilterPushdownPropagation, metrics::ExecutionPlanMetricsSet, - projection::{all_alias_free_columns, new_projections_for_columns}, DisplayAs, DisplayFormatType, }; use std::{ @@ -124,7 +126,7 @@ use log::{debug, warn}; /// let file_source = Arc::new(ParquetSource::new()); /// let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) /// .with_limit(Some(1000)) // read only the first 1000 records -/// .with_projection(Some(vec![2, 3])) // project columns 2 and 3 +/// .with_projection_indices(Some(vec![2, 3])) // project columns 2 and 3 /// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group /// .with_file(PartitionedFile::new("file1.parquet", 1234)) /// // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes @@ -175,9 +177,12 @@ pub struct FileScanConfig { pub file_groups: Vec, /// Table constraints pub constraints: Constraints, - /// Columns on which to project the data. Indexes that are higher than the - /// number of columns of `file_schema` refer to `table_partition_cols`. - pub projection: Option>, + /// Physical expressions defining the projection to apply when reading data. + /// + /// Each expression in the projection can reference columns from both the file + /// schema and table partition columns. If `None`, all columns from the table + /// schema are projected. + pub projection_exprs: Option, /// The maximum number of records to read from this plan. If `None`, /// all records after filtering are returned. pub limit: Option, @@ -229,7 +234,7 @@ pub struct FileScanConfig { /// // Set a limit of 1000 rows /// .with_limit(Some(1000)) /// // Project only the first column -/// .with_projection(Some(vec![0])) +/// .with_projection_indices(Some(vec![0])) /// // Add partition columns /// .with_table_partition_cols(vec![ /// Field::new("date", DataType::Utf8, false), @@ -261,7 +266,7 @@ pub struct FileScanConfigBuilder { table_schema: TableSchema, file_source: Arc, limit: Option, - projection: Option>, + projection_indices: Option>, constraints: Option, file_groups: Vec, statistics: Option, @@ -294,7 +299,7 @@ impl FileScanConfigBuilder { file_compression_type: None, new_lines_in_values: None, limit: None, - projection: None, + projection_indices: None, constraints: None, batch_size: None, expr_adapter_factory: None, @@ -317,10 +322,25 @@ impl FileScanConfigBuilder { self } + pub fn table_schema(&self) -> &SchemaRef { + self.table_schema.table_schema() + } + /// Set the columns on which to project the data. Indexes that are higher than the /// number of columns of `file_schema` refer to `table_partition_cols`. - pub fn with_projection(mut self, projection: Option>) -> Self { - self.projection = projection; + /// + /// # Deprecated + /// Use [`Self::with_projection_indices`] instead. This method will be removed in a future release. + #[deprecated(since = "51.0.0", note = "Use with_projection_indices instead")] + pub fn with_projection(self, indices: Option>) -> Self { + self.with_projection_indices(indices) + } + + /// Set the columns on which to project the data using column indices. + /// + /// Indexes that are higher than the number of columns of `file_schema` refer to `table_partition_cols`. 
+ pub fn with_projection_indices(mut self, indices: Option>) -> Self { + self.projection_indices = indices; self } @@ -433,7 +453,7 @@ impl FileScanConfigBuilder { table_schema, file_source, limit, - projection, + projection_indices, constraints, file_groups, statistics, @@ -455,12 +475,18 @@ impl FileScanConfigBuilder { file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); let new_lines_in_values = new_lines_in_values.unwrap_or(false); + // Convert projection indices to ProjectionExprs using the final table schema + // (which now includes partition columns if they were added) + let projection_exprs = projection_indices.map(|indices| { + ProjectionExprs::from_indices(&indices, table_schema.table_schema()) + }); + FileScanConfig { object_store_url, table_schema, file_source, limit, - projection, + projection_exprs, constraints, file_groups, output_ordering, @@ -484,7 +510,9 @@ impl From for FileScanConfigBuilder { file_compression_type: Some(config.file_compression_type), new_lines_in_values: Some(config.new_lines_in_values), limit: config.limit, - projection: config.projection, + projection_indices: config + .projection_exprs + .map(|p| p.ordered_column_indices()), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, @@ -673,15 +701,16 @@ impl DataSource for FileScanConfig { let new_projections = new_projections_for_columns( projection, &file_scan - .projection - .clone() + .projection_exprs + .as_ref() + .map(|p| p.ordered_column_indices()) .unwrap_or_else(|| (0..self.file_schema().fields().len()).collect()), ); Arc::new( FileScanConfigBuilder::from(file_scan) // Assign projected statistics to source - .with_projection(Some(new_projections)) + .with_projection_indices(Some(new_projections)) .with_source(source) .build(), ) as _ @@ -727,8 +756,8 @@ impl FileScanConfig { } fn projection_indices(&self) -> Vec { - match &self.projection { - Some(proj) => proj.clone(), + match &self.projection_exprs { + Some(proj) => proj.ordered_column_indices(), None => (0..self.file_schema().fields().len() + self.table_partition_cols().len()) .collect(), @@ -825,7 +854,7 @@ impl FileScanConfig { /// Project the schema, constraints, and the statistics on the given column indices pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec) { - if self.projection.is_none() && self.table_partition_cols().is_empty() { + if self.projection_exprs.is_none() && self.table_partition_cols().is_empty() { return ( Arc::clone(self.file_schema()), self.constraints.clone(), @@ -844,12 +873,17 @@ impl FileScanConfig { } pub fn projected_file_column_names(&self) -> Option> { - self.projection.as_ref().map(|p| { - p.iter() - .filter(|col_idx| **col_idx < self.file_schema().fields().len()) - .map(|col_idx| self.file_schema().field(*col_idx).name()) + let fields = self.file_schema().fields(); + + self.projection_exprs.as_ref().map(|p| { + let column_indices = p.ordered_column_indices(); + + column_indices + .iter() + .filter(|&&col_i| col_i < fields.len()) + .map(|&col_i| self.file_schema().field(col_i).name()) .cloned() - .collect() + .collect::>() }) } @@ -875,11 +909,11 @@ impl FileScanConfig { } pub fn file_column_projection_indices(&self) -> Option> { - self.projection.as_ref().map(|p| { - p.iter() - .filter(|col_idx| **col_idx < self.file_schema().fields().len()) - .copied() - .collect() + self.projection_exprs.as_ref().map(|p| { + p.ordered_column_indices() + .into_iter() + .filter(|&i| i < self.file_schema().fields().len()) + 
.collect::>() }) } @@ -1415,10 +1449,15 @@ fn get_projected_output_ordering( return false; } + let indices = base_config + .projection_exprs + .as_ref() + .map(|p| p.ordered_column_indices()); + let statistics = match MinMaxStatistics::new_from_files( &new_ordering, projected_schema, - base_config.projection.as_deref(), + indices.as_deref(), group.iter(), ) { Ok(statistics) => statistics, @@ -1479,7 +1518,7 @@ mod tests { use datafusion_common::{assert_batches_eq, internal_err}; use datafusion_expr::{Operator, SortExpr}; use datafusion_physical_expr::create_physical_sort_expr; - use datafusion_physical_expr::expressions::{BinaryExpr, Literal}; + use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; /// Returns the column names on the schema @@ -2143,7 +2182,7 @@ mod tests { file_schema, Arc::new(MockSource::default()), ) - .with_projection(projection) + .with_projection_indices(projection) .with_statistics(statistics) .with_table_partition_cols(table_partition_cols) .build() @@ -2196,7 +2235,7 @@ mod tests { // Build with various configurations let config = builder .with_limit(Some(1000)) - .with_projection(Some(vec![0, 1])) + .with_projection_indices(Some(vec![0, 1])) .with_table_partition_cols(vec![Field::new( "date", wrap_partition_type_in_dict(DataType::Utf8), @@ -2219,7 +2258,10 @@ mod tests { assert_eq!(config.object_store_url, object_store_url); assert_eq!(*config.file_schema(), file_schema); assert_eq!(config.limit, Some(1000)); - assert_eq!(config.projection, Some(vec![0, 1])); + assert_eq!( + config.projection_exprs.as_ref().map(|p| p.column_indices()), + Some(vec![0, 1]) + ); assert_eq!(config.table_partition_cols().len(), 1); assert_eq!(config.table_partition_cols()[0].name(), "date"); assert_eq!(config.file_groups.len(), 1); @@ -2253,7 +2295,7 @@ mod tests { Arc::clone(&file_schema), Arc::clone(&file_source), ) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); // Simulate projection being updated. 
Since the filter has already been pushed down, @@ -2302,7 +2344,10 @@ mod tests { assert_eq!(config.object_store_url, object_store_url); assert_eq!(*config.file_schema(), file_schema); assert_eq!(config.limit, None); - assert_eq!(config.projection, None); + assert_eq!( + config.projection_exprs.as_ref().map(|p| p.column_indices()), + None + ); assert!(config.table_partition_cols().is_empty()); assert!(config.file_groups.is_empty()); assert_eq!( @@ -2357,7 +2402,7 @@ mod tests { Arc::clone(&schema), Arc::clone(&file_source), ) - .with_projection(Some(vec![0, 2])) + .with_projection_indices(Some(vec![0, 2])) .with_limit(Some(10)) .with_table_partition_cols(partition_cols.clone()) .with_file(file.clone()) @@ -2375,7 +2420,13 @@ mod tests { let partition_cols = partition_cols.into_iter().map(Arc::new).collect::>(); assert_eq!(new_config.object_store_url, object_store_url); assert_eq!(*new_config.file_schema(), schema); - assert_eq!(new_config.projection, Some(vec![0, 2])); + assert_eq!( + new_config + .projection_exprs + .as_ref() + .map(|p| p.column_indices()), + Some(vec![0, 2]) + ); assert_eq!(new_config.limit, Some(10)); assert_eq!(*new_config.table_partition_cols(), partition_cols); assert_eq!(new_config.file_groups.len(), 1); @@ -2594,7 +2645,7 @@ mod tests { Arc::clone(&schema), Arc::new(MockSource::default()), ) - .with_projection(Some(vec![0, 2])) // Only project columns 0 and 2 + .with_projection_indices(Some(vec![0, 2])) // Only project columns 0 and 2 .with_file_groups(vec![file_group]) .build(); diff --git a/datafusion/datasource/src/table_schema.rs b/datafusion/datasource/src/table_schema.rs index 8e95585ce873..863c123e3b1d 100644 --- a/datafusion/datasource/src/table_schema.rs +++ b/datafusion/datasource/src/table_schema.rs @@ -132,6 +132,10 @@ impl TableSchema { table_partition_cols: Vec, ) -> TableSchema { self.table_partition_cols = table_partition_cols; + // Rebuild the table schema with the new partition columns + let mut builder = SchemaBuilder::from(self.file_schema.as_ref()); + builder.extend(self.table_partition_cols.iter().cloned()); + self.table_schema = Arc::new(builder.finish()); self } diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs index e35bfbb3a20d..fc972d644e67 100644 --- a/datafusion/physical-expr/src/projection.rs +++ b/datafusion/physical-expr/src/projection.rs @@ -100,24 +100,24 @@ impl From for (Arc, String) { /// representing a complete projection operation and provides /// methods to manipulate and analyze the projection as a whole. 
#[derive(Debug, Clone)] -pub struct Projection { +pub struct ProjectionExprs { exprs: Vec, } -impl std::fmt::Display for Projection { +impl std::fmt::Display for ProjectionExprs { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let exprs: Vec = self.exprs.iter().map(|e| e.to_string()).collect(); write!(f, "Projection[{}]", exprs.join(", ")) } } -impl From> for Projection { +impl From> for ProjectionExprs { fn from(value: Vec) -> Self { Self { exprs: value } } } -impl From<&[ProjectionExpr]> for Projection { +impl From<&[ProjectionExpr]> for ProjectionExprs { fn from(value: &[ProjectionExpr]) -> Self { Self { exprs: value.to_vec(), @@ -125,15 +125,83 @@ impl From<&[ProjectionExpr]> for Projection { } } -impl AsRef<[ProjectionExpr]> for Projection { +impl FromIterator for ProjectionExprs { + fn from_iter>(exprs: T) -> Self { + Self { + exprs: exprs.into_iter().collect::>(), + } + } +} + +impl AsRef<[ProjectionExpr]> for ProjectionExprs { fn as_ref(&self) -> &[ProjectionExpr] { &self.exprs } } -impl Projection { - pub fn new(exprs: Vec) -> Self { - Self { exprs } +impl ProjectionExprs { + pub fn new(exprs: I) -> Self + where + I: IntoIterator, + { + Self { + exprs: exprs.into_iter().collect::>(), + } + } + + /// Creates a [`ProjectionExpr`] from a list of column indices. + /// + /// This is a convenience method for creating simple column-only projections, where each projection expression is a reference to a column + /// in the input schema. + /// + /// # Behavior + /// - Ordering: the output projection preserves the exact order of indices provided in the input slice + /// For example, `[2, 0, 1]` will produce projections for columns 2, 0, then 1 in that order + /// - Duplicates: Duplicate indices are allowed and will create multiple projection expressions referencing the same source column + /// For example, `[0, 0]` creates 2 separate projections both referencing column 0 + /// + /// # Panics + /// Panics if any index in `indices` is out of bounds for the provided schema. 
+ /// + /// # Example + /// + /// ```rust + /// use std::sync::Arc; + /// use arrow::datatypes::{Schema, Field, DataType}; + /// use datafusion_physical_expr::projection::ProjectionExprs; + /// + /// // Create a schema with three columns + /// let schema = Arc::new(Schema::new(vec![ + /// Field::new("a", DataType::Int32, false), + /// Field::new("b", DataType::Utf8, false), + /// Field::new("c", DataType::Float64, false), + /// ])); + /// + /// // Project columns at indices 2 and 0 (c and a) - ordering is preserved + /// let projection = ProjectionExprs::from_indices(&[2, 0], &schema); + /// + /// // This creates: SELECT c@2 AS c, a@0 AS a + /// assert_eq!(projection.as_ref().len(), 2); + /// assert_eq!(projection.as_ref()[0].alias, "c"); + /// assert_eq!(projection.as_ref()[1].alias, "a"); + /// + /// // Duplicate indices are allowed + /// let projection_with_dups = ProjectionExprs::from_indices(&[0, 0, 1], &schema); + /// assert_eq!(projection_with_dups.as_ref().len(), 3); + /// assert_eq!(projection_with_dups.as_ref()[0].alias, "a"); + /// assert_eq!(projection_with_dups.as_ref()[1].alias, "a"); // duplicate + /// assert_eq!(projection_with_dups.as_ref()[2].alias, "b"); + /// ``` + pub fn from_indices(indices: &[usize], schema: &SchemaRef) -> Self { + let projection_exprs = indices.iter().map(|&i| { + let field = schema.field(i); + ProjectionExpr { + expr: Arc::new(Column::new(field.name(), i)), + alias: field.name().clone(), + } + }); + + Self::from_iter(projection_exprs) } /// Returns an iterator over the projection expressions @@ -167,7 +235,7 @@ impl Projection { /// /// ```rust /// use std::sync::Arc; - /// use datafusion_physical_expr::projection::{Projection, ProjectionExpr}; + /// use datafusion_physical_expr::projection::{ProjectionExprs, ProjectionExpr}; /// use datafusion_physical_expr::expressions::{Column, BinaryExpr, Literal}; /// use datafusion_common::{Result, ScalarValue}; /// use datafusion_expr::Operator; @@ -175,7 +243,7 @@ impl Projection { /// fn main() -> Result<()> { /// // Example from the docstring: /// // Base projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z - /// let base = Projection::new(vec![ + /// let base = ProjectionExprs::new(vec![ /// ProjectionExpr { /// expr: Arc::new(Column::new("c", 2)), /// alias: "x".to_string(), @@ -191,7 +259,7 @@ impl Projection { /// ]); /// /// // Top projection: SELECT x@0 + 1 AS c1, y@1 + z@2 AS c2 - /// let top = Projection::new(vec![ + /// let top = ProjectionExprs::new(vec![ /// ProjectionExpr { /// expr: Arc::new(BinaryExpr::new( /// Arc::new(Column::new("x", 0)), @@ -224,7 +292,7 @@ impl Projection { /// # Errors /// This function returns an error if any expression in the `other` projection cannot be /// applied on top of this projection. - pub fn try_merge(&self, other: &Projection) -> Result { + pub fn try_merge(&self, other: &ProjectionExprs) -> Result { let mut new_exprs = Vec::with_capacity(other.exprs.len()); for proj_expr in &other.exprs { let new_expr = update_expr(&proj_expr.expr, &self.exprs, true)? @@ -240,7 +308,7 @@ impl Projection { alias: proj_expr.alias.clone(), }); } - Ok(Projection::new(new_exprs)) + Ok(ProjectionExprs::new(new_exprs)) } /// Extract the column indices used in this projection. @@ -256,6 +324,46 @@ impl Projection { .collect_vec() } + /// Extract the ordered column indices for a column-only projection. + /// + /// This function assumes that all expressions in the projection are simple column references. + /// It returns the column indices in the order they appear in the projection. 
+ /// + /// # Panics + /// + /// Panics if any expression in the projection is not a simple column reference. This includes: + /// - Computed expressions (e.g., `a + 1`, `CAST(a AS INT)`) + /// - Function calls (e.g., `UPPER(name)`, `SUM(amount)`) + /// - Literals (e.g., `42`, `'hello'`) + /// - Complex nested expressions (e.g., `CASE WHEN ... THEN ... END`) + /// + /// # Returns + /// + /// A vector of column indices in projection order. Unlike [`column_indices()`](Self::column_indices), + /// this function: + /// - Preserves the projection order (does not sort) + /// - Preserves duplicates (does not deduplicate) + /// + /// # Example + /// + /// For a projection `SELECT c, a, c` where `a` is at index 0 and `c` is at index 2, + /// this function would return `[2, 0, 2]`. + /// + /// Use [`column_indices()`](Self::column_indices) instead if the projection may contain + /// non-column expressions or if you need a deduplicated sorted list. + pub fn ordered_column_indices(&self) -> Vec { + self.exprs + .iter() + .map(|e| { + e.expr + .as_any() + .downcast_ref::() + .expect("Expected column reference in projection") + .index() + }) + .collect() + } + /// Project a schema according to this projection. /// For example, for a projection `SELECT a AS x, b + 1 AS y`, where `a` is at index 0 and `b` is at index 1, /// if the input schema is `[a: Int32, b: Int32, c: Int32]`, the output schema would be `[x: Int32, y: Int32]`. @@ -327,7 +435,7 @@ impl Projection { } } -impl<'a> IntoIterator for &'a Projection { +impl<'a> IntoIterator for &'a ProjectionExprs { type Item = &'a ProjectionExpr; type IntoIter = std::slice::Iter<'a, ProjectionExpr>; @@ -336,7 +444,7 @@ impl<'a> IntoIterator for &'a Projection { } } -impl IntoIterator for Projection { +impl IntoIterator for ProjectionExprs { type Item = ProjectionExpr; type IntoIter = std::vec::IntoIter; @@ -1570,7 +1678,7 @@ pub(crate) mod tests { let source = get_stats(); let schema = get_schema(); - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col1", 1)), alias: "col1".to_string(), @@ -1612,7 +1720,7 @@ pub(crate) mod tests { let source = get_stats(); let schema = get_schema(); - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col2", 2)), alias: "col2".to_string(), @@ -1663,7 +1771,7 @@ pub(crate) mod tests { alias: "b".to_string(), }, ]; - let projection = Projection::new(exprs.clone()); + let projection = ProjectionExprs::new(exprs.clone()); assert_eq!(projection.as_ref().len(), 2); Ok(()) } @@ -1674,7 +1782,7 @@ pub(crate) mod tests { expr: Arc::new(Column::new("x", 0)), alias: "x".to_string(), }]; - let projection: Projection = exprs.clone().into(); + let projection: ProjectionExprs = exprs.clone().into(); assert_eq!(projection.as_ref().len(), 1); Ok(()) } @@ -1691,7 +1799,7 @@ pub(crate) mod tests { alias: "col2".to_string(), }, ]; - let projection = Projection::new(exprs); + let projection = ProjectionExprs::new(exprs); let as_ref: &[ProjectionExpr] = projection.as_ref(); assert_eq!(as_ref.len(), 2); Ok(()) @@ -1700,7 +1808,7 @@ pub(crate) mod tests { #[test] fn test_column_indices_multiple_columns() -> Result<()> { // Test with reversed column order to ensure proper reordering - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 5)), alias: "c".to_string(), @@ -1722,7 +1830,7 @@ pub(crate) mod tests { 
#[test] fn test_column_indices_duplicates() -> Result<()> { // Test that duplicate column indices appear only once - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("a", 1)), alias: "a".to_string(), @@ -1743,7 +1851,7 @@ pub(crate) mod tests { #[test] fn test_column_indices_unsorted() -> Result<()> { // Test that column indices are sorted in the output - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 5)), alias: "c".to_string(), @@ -1769,7 +1877,7 @@ pub(crate) mod tests { Operator::Plus, Arc::new(Column::new("b", 4)), )); - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr, alias: "sum".to_string(), @@ -1786,7 +1894,7 @@ pub(crate) mod tests { #[test] fn test_column_indices_empty() -> Result<()> { - let projection = Projection::new(vec![]); + let projection = ProjectionExprs::new(vec![]); assert_eq!(projection.column_indices(), Vec::::new()); Ok(()) } @@ -1794,7 +1902,7 @@ pub(crate) mod tests { #[test] fn test_merge_simple_columns() -> Result<()> { // First projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z - let base_projection = Projection::new(vec![ + let base_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 2)), alias: "x".to_string(), @@ -1810,7 +1918,7 @@ pub(crate) mod tests { ]); // Second projection: SELECT y@1 AS col2, x@0 AS col1 - let top_projection = Projection::new(vec![ + let top_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("y", 1)), alias: "col2".to_string(), @@ -1831,7 +1939,7 @@ pub(crate) mod tests { #[test] fn test_merge_with_expressions() -> Result<()> { // First projection: SELECT c@2 AS x, b@1 AS y, a@0 AS z - let base_projection = Projection::new(vec![ + let base_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("c", 2)), alias: "x".to_string(), @@ -1847,7 +1955,7 @@ pub(crate) mod tests { ]); // Second projection: SELECT y@1 + z@2 AS c2, x@0 + 1 AS c1 - let top_projection = Projection::new(vec![ + let top_projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(BinaryExpr::new( Arc::new(Column::new("y", 1)), @@ -1876,7 +1984,7 @@ pub(crate) mod tests { #[test] fn try_merge_error() { // Create a base projection - let base = Projection::new(vec![ + let base = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("a", 0)), alias: "x".to_string(), @@ -1888,7 +1996,7 @@ pub(crate) mod tests { ]); // Create a top projection that references a non-existent column index - let top = Projection::new(vec![ProjectionExpr { + let top = ProjectionExprs::new(vec![ProjectionExpr { expr: Arc::new(Column::new("z", 5)), // Invalid index alias: "result".to_string(), }]); @@ -1907,7 +2015,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection: SELECT col2 AS c, col0 AS a - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col2", 2)), alias: "c".to_string(), @@ -1940,7 +2048,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection: SELECT col0 + 1 AS incremented - let projection = Projection::new(vec![ProjectionExpr { + let projection = ProjectionExprs::new(vec![ProjectionExpr { expr: Arc::new(BinaryExpr::new( Arc::new(Column::new("col0", 0)), Operator::Plus, @@ -1974,7 +2082,7 @@ pub(crate) mod tests { ]); // 
Projection: SELECT col0 AS renamed - let projection = Projection::new(vec![ProjectionExpr { + let projection = ProjectionExprs::new(vec![ProjectionExpr { expr: Arc::new(Column::new("col0", 0)), alias: "renamed".to_string(), }]); @@ -1994,7 +2102,7 @@ pub(crate) mod tests { #[test] fn test_project_schema_empty() -> Result<()> { let input_schema = get_schema(); - let projection = Projection::new(vec![]); + let projection = ProjectionExprs::new(vec![]); let output_schema = projection.project_schema(&input_schema)?; @@ -2009,7 +2117,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection: SELECT col1 AS text, col0 AS num - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col1", 1)), alias: "text".to_string(), @@ -2057,7 +2165,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection with expression: SELECT col0 + 1 AS incremented, col1 AS text - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(BinaryExpr::new( Arc::new(Column::new("col0", 0)), @@ -2105,7 +2213,7 @@ pub(crate) mod tests { let input_schema = get_schema(); // Projection with only primitive width columns: SELECT col2 AS f, col0 AS i - let projection = Projection::new(vec![ + let projection = ProjectionExprs::new(vec![ ProjectionExpr { expr: Arc::new(Column::new("col2", 2)), alias: "f".to_string(), @@ -2136,7 +2244,7 @@ pub(crate) mod tests { let input_stats = get_stats(); let input_schema = get_schema(); - let projection = Projection::new(vec![]); + let projection = ProjectionExprs::new(vec![]); let output_stats = projection.project_statistics(input_stats, &input_schema)?; diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 4dc88bc56631..2c84570b33d9 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -53,7 +53,9 @@ use datafusion_physical_expr_common::physical_expr::{fmt_sql, PhysicalExprRef}; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; // Re-exported from datafusion-physical-expr for backwards compatibility // We recommend updating your imports to use datafusion-physical-expr directly -pub use datafusion_physical_expr::projection::{update_expr, Projection, ProjectionExpr}; +pub use datafusion_physical_expr::projection::{ + update_expr, ProjectionExpr, ProjectionExprs, +}; use futures::stream::{Stream, StreamExt}; use log::trace; @@ -65,7 +67,7 @@ use log::trace; #[derive(Debug, Clone)] pub struct ProjectionExec { /// The projection expressions stored as tuples of (expression, output column name) - projection: Projection, + projection: ProjectionExprs, /// The schema once the projection has been applied to the input schema: SchemaRef, /// The input plan @@ -130,7 +132,7 @@ impl ProjectionExec { let input_schema = input.schema(); // convert argument to Vec let expr_vec = expr.into_iter().map(Into::into).collect::>(); - let projection = Projection::new(expr_vec); + let projection = ProjectionExprs::new(expr_vec); let schema = Arc::new(projection.project_schema(&input_schema)?); diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 7c4b9e55b813..2a3906d49347 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -545,7 +545,7 @@ pub fn parse_protobuf_file_scan_config( 
.with_file_groups(file_groups) .with_constraints(constraints) .with_statistics(statistics) - .with_projection(Some(projection)) + .with_projection_indices(Some(projection)) .with_limit(proto.limit.as_ref().map(|sl| sl.limit as usize)) .with_table_partition_cols(table_partition_cols) .with_output_ordering(output_ordering) diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 399c234191aa..dc0a78dbccf1 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -532,9 +532,10 @@ pub fn serialize_file_scan_config( statistics: Some((&conf.file_source.statistics().unwrap()).into()), limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }), projection: conf - .projection + .projection_exprs .as_ref() - .unwrap_or(&(0..schema.fields().len()).collect::>()) + .map(|p| p.column_indices()) + .unwrap_or((0..schema.fields().len()).collect::>()) .iter() .map(|n| *n as u32) .collect(), diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index a0456e2031be..c8b2bc02e447 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -920,7 +920,7 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { schema, file_source, ) - .with_projection(Some(vec![0, 1])) + .with_projection_indices(Some(vec![0, 1])) .with_file_group(FileGroup::new(vec![file_group])) .with_table_partition_cols(vec![Field::new( "part".to_string(), @@ -1814,7 +1814,7 @@ async fn roundtrip_projection_source() -> Result<()> { 1024, )])]) .with_statistics(statistics) - .with_projection(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2])) .build(); let filter = Arc::new( diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index ecf465dd3f18..45a19cea80cf 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -151,8 +151,8 @@ pub async fn from_substrait_rel( .iter() .map(|item| item.field as usize) .collect(); - base_config_builder = - base_config_builder.with_projection(Some(column_indices)); + base_config_builder = base_config_builder + .with_projection_indices(Some(column_indices)); } } diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index 63abd14d6f5e..20d41c2e6112 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -92,11 +92,12 @@ pub fn to_substrait_rel( }; let mut select_struct = None; - if let Some(projection) = file_config.projection.as_ref() { + if let Some(projection) = file_config.projection_exprs.as_ref() { let struct_items = projection - .iter() + .column_indices() + .into_iter() .map(|index| StructItem { - field: *index as i32, + field: index as i32, // FIXME: duckdb sets this to None, but it's not clear why. 
// https://github.com/duckdb/substrait/blob/b6f56643cb11d52de0e32c24a01dfd5947df62be/src/to_substrait.cpp#L1191 child: None, diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 4174fef7a692..c568b8b28e1f 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -125,6 +125,57 @@ Users may need to update their paths to account for these changes. See [issue #17713] for more details. +### `FileScanConfig::projection` renamed to `FileScanConfig::projection_exprs` + +The `projection` field in `FileScanConfig` has been renamed to `projection_exprs` and its type has changed from `Option>` to `Option`. This change enables more powerful projection pushdown capabilities by supporting arbitrary physical expressions rather than just column indices. + +**Impact on direct field access:** + +If you directly access the `projection` field: + +```rust +# /* comment to avoid running +let config: FileScanConfig = ...; +let projection = config.projection; +# */ +``` + +You should update to: + +```rust +# /* comment to avoid running +let config: FileScanConfig = ...; +let projection_exprs = config.projection_exprs; +# */ +``` + +**Impact on builders:** + +The `FileScanConfigBuilder::with_projection()` method has been deprecated in favor of `with_projection_indices()`: + +```diff +let config = FileScanConfigBuilder::new(url, schema, file_source) +- .with_projection(Some(vec![0, 2, 3])) ++ .with_projection_indices(Some(vec![0, 2, 3])) + .build(); +``` + +Note: `with_projection()` still works but is deprecated and will be removed in a future release. + +**What is `ProjectionExprs`?** + +`ProjectionExprs` is a new type that represents a list of physical expressions for projection. While it can be constructed from column indices (which is what `with_projection_indices` does internally), it also supports arbitrary physical expressions, enabling advanced features like expression evaluation during scanning. + +You can access column indices from `ProjectionExprs` using its methods if needed: + +```rust +# /* comment to avoid running +let projection_exprs: ProjectionExprs = ...; +// Get the column indices if the projection only contains simple column references +let indices = projection_exprs.column_indices(); +# */ +``` + ### `DESCRIBE query` support `DESCRIBE query` was previously an alias for `EXPLAIN query`, which outputs the From fe54d8748aaa1798a3b8e7902e07f05b97ce1233 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Tue, 28 Oct 2025 13:17:54 +1100 Subject: [PATCH 023/157] Deduplicate range/gen_series nested functions code (#18198) ## Which issue does this PR close? - Doing some prework for #15881 ## Rationale for this change `Range` and `GenSeries` are essentially the same except for whether they include upper bounds or not; unify their function code to reduce duplication, making future changes easier. ## What changes are included in this PR? Remove `GenSeries` struct, folding it into `Range`. Do some more minor refactoring to their code. ## Are these changes tested? Existing tests (updated some error messages). ## Are there any user-facing changes? Not really (updated some error messages). 
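
For context, a minimal illustration of the user-facing semantics this refactor keeps intact (values mirror the examples in the doc strings updated below); `range` still excludes the upper bound while `generate_series` includes it:

```sql
-- range: upper bound excluded (start <= x < stop)
select range(2, 10, 3);        -- returns [2, 5, 8]

-- generate_series: upper bound included
select generate_series(1, 3);  -- returns [1, 2, 3]
```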
--- datafusion/functions-nested/src/macros.rs | 25 +- datafusion/functions-nested/src/range.rs | 705 ++++++++---------- datafusion/sqllogictest/test_files/array.slt | 10 +- .../source/user-guide/sql/scalar_functions.md | 14 +- 4 files changed, 334 insertions(+), 420 deletions(-) diff --git a/datafusion/functions-nested/src/macros.rs b/datafusion/functions-nested/src/macros.rs index cec7f2fd562d..5380f6b1272d 100644 --- a/datafusion/functions-nested/src/macros.rs +++ b/datafusion/functions-nested/src/macros.rs @@ -41,10 +41,15 @@ /// * `arg`: 0 or more named arguments for the function /// * `DOC`: documentation string for the function /// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF` +/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it +/// automatically resolves to `$UDF::new()`. /// /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl macro_rules! make_udf_expr_and_func { - ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $SCALAR_UDF_FN:ident) => { + ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $SCALAR_UDF_FN:ident) => { + make_udf_expr_and_func!($UDF, $EXPR_FN, $($arg)*, $DOC, $SCALAR_UDF_FN, $UDF::new); + }; + ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $SCALAR_UDF_FN:ident, $CTOR:path) => { paste::paste! { // "fluent expr_fn" style function #[doc = $DOC] @@ -54,10 +59,13 @@ macro_rules! make_udf_expr_and_func { vec![$($arg),*], )) } - create_func!($UDF, $SCALAR_UDF_FN); + create_func!($UDF, $SCALAR_UDF_FN, $CTOR); } }; - ($UDF:ty, $EXPR_FN:ident, $DOC:expr , $SCALAR_UDF_FN:ident) => { + ($UDF:ident, $EXPR_FN:ident, $DOC:expr, $SCALAR_UDF_FN:ident) => { + make_udf_expr_and_func!($UDF, $EXPR_FN, $DOC, $SCALAR_UDF_FN, $UDF::new); + }; + ($UDF:ident, $EXPR_FN:ident, $DOC:expr, $SCALAR_UDF_FN:ident, $CTOR:path) => { paste::paste! { // "fluent expr_fn" style function #[doc = $DOC] @@ -67,7 +75,7 @@ macro_rules! make_udf_expr_and_func { arg, )) } - create_func!($UDF, $SCALAR_UDF_FN); + create_func!($UDF, $SCALAR_UDF_FN, $CTOR); } }; } @@ -80,10 +88,15 @@ macro_rules! make_udf_expr_and_func { /// # Arguments /// * `UDF`: name of the [`ScalarUDFImpl`] /// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF` +/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it +/// automatically resolves to `$UDF::new()`. /// /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl macro_rules! create_func { - ($UDF:ty, $SCALAR_UDF_FN:ident) => { + ($UDF:ident, $SCALAR_UDF_FN:ident) => { + create_func!($UDF, $SCALAR_UDF_FN, $UDF::new); + }; + ($UDF:ident, $SCALAR_UDF_FN:ident, $CTOR:path) => { paste::paste! { #[doc = concat!("ScalarFunction that returns a [`ScalarUDF`](datafusion_expr::ScalarUDF) for ")] #[doc = stringify!($UDF)] @@ -92,7 +105,7 @@ macro_rules! 
create_func { static INSTANCE: std::sync::LazyLock> = std::sync::LazyLock::new(|| { std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( - <$UDF>::new(), + $CTOR(), )) }); std::sync::Arc::clone(&INSTANCE) diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs index 619b0e84c19a..01c6e9c43f2e 100644 --- a/datafusion/functions-nested/src/range.rs +++ b/datafusion/functions-nested/src/range.rs @@ -22,20 +22,23 @@ use arrow::array::{ builder::{Date32Builder, TimestampNanosecondBuilder}, temporal_conversions::as_datetime_with_timezone, timezone::Tz, - types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType as TSNT}, - Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullArray, NullBufferBuilder, - TimestampNanosecondArray, + types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType}, + Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullBufferBuilder, }; use arrow::buffer::OffsetBuffer; use arrow::datatypes::{ DataType, DataType::*, Field, IntervalUnit::MonthDayNano, TimeUnit::Nanosecond, }; -use datafusion_common::cast::{ - as_date32_array, as_int64_array, as_interval_mdn_array, as_timestamp_nanosecond_array, +use datafusion_common::{ + cast::{ + as_date32_array, as_int64_array, as_interval_mdn_array, + as_timestamp_nanosecond_array, + }, + DataFusionError, ScalarValue, }; use datafusion_common::{ - exec_datafusion_err, exec_err, internal_err, not_impl_datafusion_err, - utils::take_function_args, Result, + exec_datafusion_err, exec_err, not_impl_datafusion_err, utils::take_function_args, + Result, }; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, @@ -53,13 +56,24 @@ make_udf_expr_and_func!( range, start stop step, "create a list of values in the range between start and stop", - range_udf + range_udf, + Range::new +); + +make_udf_expr_and_func!( + GenSeries, + gen_series, + start stop step, + "create a list of values in the range between start and stop, include upper bound", + gen_series_udf, + Range::generate_series ); #[user_doc( doc_section(label = "Array Functions"), description = "Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. Step cannot be 0.", - syntax_example = "range(start, stop, step)", + syntax_example = "range(stop) +range(start, stop[, step])", sql_example = r#"```sql > select range(2, 10, 3); +-----------------------------------+ @@ -69,11 +83,11 @@ make_udf_expr_and_func!( +-----------------------------------+ > select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH); -+--------------------------------------------------------------+ -| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ +| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | ++--------------------------------------------------------------------------+ | [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ ```"#, argument( name = "start", @@ -88,115 +102,13 @@ make_udf_expr_and_func!( description = "Increase by step (cannot be 0). Steps less than a day are supported only for timestamp ranges." 
) )] -#[derive(Debug, PartialEq, Eq, Hash)] -pub struct Range { - signature: Signature, - aliases: Vec, -} - -impl Default for Range { - fn default() -> Self { - Self::new() - } -} -impl Range { - pub fn new() -> Self { - Self { - signature: Signature::user_defined(Volatility::Immutable), - aliases: vec![], - } - } -} -impl ScalarUDFImpl for Range { - fn as_any(&self) -> &dyn Any { - self - } - fn name(&self) -> &str { - "range" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - arg_types - .iter() - .map(|arg_type| match arg_type { - Null => Ok(Null), - Int8 => Ok(Int64), - Int16 => Ok(Int64), - Int32 => Ok(Int64), - Int64 => Ok(Int64), - UInt8 => Ok(Int64), - UInt16 => Ok(Int64), - UInt32 => Ok(Int64), - UInt64 => Ok(Int64), - Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())), - Date32 => Ok(Date32), - Date64 => Ok(Date32), - Utf8 => Ok(Date32), - LargeUtf8 => Ok(Date32), - Utf8View => Ok(Date32), - Interval(_) => Ok(Interval(MonthDayNano)), - _ => exec_err!("Unsupported DataType"), - }) - .try_collect() - } - - fn return_type(&self, arg_types: &[DataType]) -> Result { - if arg_types.iter().any(|t| t.is_null()) { - Ok(Null) - } else { - Ok(List(Arc::new(Field::new_list_field( - arg_types[0].clone(), - true, - )))) - } - } - - fn invoke_with_args( - &self, - args: datafusion_expr::ScalarFunctionArgs, - ) -> Result { - let args = &args.args; - - if args.iter().any(|arg| arg.data_type().is_null()) { - return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1)))); - } - match args[0].data_type() { - Int64 => make_scalar_function(|args| gen_range_inner(args, false))(args), - Date32 => make_scalar_function(|args| gen_range_date(args, false))(args), - Timestamp(_, _) => { - make_scalar_function(|args| gen_range_timestamp(args, false))(args) - } - dt => { - exec_err!("unsupported type for RANGE. Expected Int64, Date32 or Timestamp, got: {dt}") - } - } - } - - fn aliases(&self) -> &[String] { - &self.aliases - } - - fn documentation(&self) -> Option<&Documentation> { - self.doc() - } -} - -make_udf_expr_and_func!( - GenSeries, - gen_series, - start stop step, - "create a list of values in the range between start and stop, include upper bound", - gen_series_udf -); +struct RangeDoc {} #[user_doc( doc_section(label = "Array Functions"), description = "Similar to the range function, but it includes the upper bound.", - syntax_example = "generate_series(start, stop, step)", + syntax_example = "generate_series(stop) +generate_series(start, stop[, step])", sql_example = r#"```sql > select generate_series(1,3); +------------------------------------+ @@ -218,25 +130,50 @@ make_udf_expr_and_func!( description = "Increase by step (can not be 0). Steps less than a day are supported only for timestamp ranges." ) )] +struct GenerateSeriesDoc {} + #[derive(Debug, PartialEq, Eq, Hash)] -pub(super) struct GenSeries { +pub struct Range { signature: Signature, - aliases: Vec, + /// `false` for range, `true` for generate_series + include_upper_bound: bool, +} + +impl Default for Range { + fn default() -> Self { + Self::new() + } } -impl GenSeries { + +impl Range { + /// Generate `range()` function which excludes upper bound. pub fn new() -> Self { Self { signature: Signature::user_defined(Volatility::Immutable), - aliases: vec![], + include_upper_bound: false, + } + } + + /// Generate `generate_series()` function which includes upper bound. 
+ fn generate_series() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + include_upper_bound: true, } } } -impl ScalarUDFImpl for GenSeries { + +impl ScalarUDFImpl for Range { fn as_any(&self) -> &dyn Any { self } + fn name(&self) -> &str { - "generate_series" + if self.include_upper_bound { + "generate_series" + } else { + "range" + } } fn signature(&self) -> &Signature { @@ -286,107 +223,263 @@ impl ScalarUDFImpl for GenSeries { let args = &args.args; if args.iter().any(|arg| arg.data_type().is_null()) { - return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1)))); + return Ok(ColumnarValue::Scalar(ScalarValue::Null)); } match args[0].data_type() { - Int64 => make_scalar_function(|args| gen_range_inner(args, true))(args), - Date32 => make_scalar_function(|args| gen_range_date(args, true))(args), + Int64 => make_scalar_function(|args| self.gen_range_inner(args))(args), + Date32 => make_scalar_function(|args| self.gen_range_date(args))(args), Timestamp(_, _) => { - make_scalar_function(|args| gen_range_timestamp(args, true))(args) + make_scalar_function(|args| self.gen_range_timestamp(args))(args) } dt => { - exec_err!( - "unsupported type for GENERATE_SERIES. Expected Int64, Date32 or Timestamp, got: {}", - dt - ) + exec_err!("unsupported type for {}. Expected Int64, Date32 or Timestamp, got: {dt}", self.name()) } } } - fn aliases(&self) -> &[String] { - &self.aliases - } - fn documentation(&self) -> Option<&Documentation> { - self.doc() + if self.include_upper_bound { + GenerateSeriesDoc {}.doc() + } else { + RangeDoc {}.doc() + } } } -/// Generates an array of integers from start to stop with a given step. -/// -/// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values. -/// It returns a `Result` representing the resulting ListArray after the operation. -/// -/// # Arguments -/// -/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and step(step value can not be zero.) values. -/// -/// # Examples -/// -/// gen_range(3) => [0, 1, 2] -/// gen_range(1, 4) => [1, 2, 3] -/// gen_range(1, 7, 2) => [1, 3, 5] -pub(super) fn gen_range_inner( - args: &[ArrayRef], - include_upper: bool, -) -> Result { - let (start_array, stop_array, step_array) = match args.len() { - 1 => (None, as_int64_array(&args[0])?, None), - 2 => ( - Some(as_int64_array(&args[0])?), - as_int64_array(&args[1])?, - None, - ), - 3 => ( - Some(as_int64_array(&args[0])?), - as_int64_array(&args[1])?, - Some(as_int64_array(&args[2])?), - ), - _ => return exec_err!("gen_range expects 1 to 3 arguments"), - }; - - let mut values = vec![]; - let mut offsets = vec![0]; - let mut valid = NullBufferBuilder::new(stop_array.len()); - for (idx, stop) in stop_array.iter().enumerate() { - match retrieve_range_args(start_array, stop, step_array, idx) { - Some((_, _, 0)) => { - return exec_err!( - "step can't be 0 for function {}(start [, stop, step])", - if include_upper { - "generate_series" - } else { - "range" - } - ); +impl Range { + /// Generates an array of integers from start to stop with a given step. + /// + /// This function takes 1 to 3 ArrayRefs as arguments, representing start, stop, and step values. + /// It returns a `Result` representing the resulting ListArray after the operation. + /// + /// # Arguments + /// + /// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and step(step value can not be zero.) values. 
+ /// + /// # Examples + /// + /// gen_range(3) => [0, 1, 2] + /// gen_range(1, 4) => [1, 2, 3] + /// gen_range(1, 7, 2) => [1, 3, 5] + fn gen_range_inner(&self, args: &[ArrayRef]) -> Result { + let (start_array, stop_array, step_array) = match args { + [stop_array] => (None, as_int64_array(stop_array)?, None), + [start_array, stop_array] => ( + Some(as_int64_array(start_array)?), + as_int64_array(stop_array)?, + None, + ), + [start_array, stop_array, step_array] => ( + Some(as_int64_array(start_array)?), + as_int64_array(stop_array)?, + Some(as_int64_array(step_array)?), + ), + _ => return exec_err!("{} expects 1 to 3 arguments", self.name()), + }; + + let mut values = vec![]; + let mut offsets = vec![0]; + let mut valid = NullBufferBuilder::new(stop_array.len()); + for (idx, stop) in stop_array.iter().enumerate() { + match retrieve_range_args(start_array, stop, step_array, idx) { + Some((_, _, 0)) => { + return exec_err!( + "step can't be 0 for function {}(start [, stop, step])", + self.name() + ); + } + Some((start, stop, step)) => { + // Below, we utilize `usize` to represent steps. + // On 32-bit targets, the absolute value of `i64` may fail to fit into `usize`. + let step_abs = + usize::try_from(step.unsigned_abs()).map_err(|_| { + not_impl_datafusion_err!("step {} can't fit into usize", step) + })?; + values.extend( + gen_range_iter(start, stop, step < 0, self.include_upper_bound) + .step_by(step_abs), + ); + offsets.push(values.len() as i32); + valid.append_non_null(); + } + // If any of the arguments is NULL, append a NULL value to the result. + None => { + offsets.push(values.len() as i32); + valid.append_null(); + } + }; + } + let arr = Arc::new(ListArray::try_new( + Arc::new(Field::new_list_field(Int64, true)), + OffsetBuffer::new(offsets.into()), + Arc::new(Int64Array::from(values)), + valid.finish(), + )?); + Ok(arr) + } + + fn gen_range_date(&self, args: &[ArrayRef]) -> Result { + let [start, stop, step] = take_function_args(self.name(), args)?; + + let (start_array, stop_array, step_array) = ( + as_date32_array(start)?, + as_date32_array(stop)?, + as_interval_mdn_array(step)?, + ); + + // values are date32s + let values_builder = Date32Builder::new(); + let mut list_builder = ListBuilder::new(values_builder); + + for idx in 0..stop_array.len() { + if start_array.is_null(idx) + || stop_array.is_null(idx) + || step_array.is_null(idx) + { + list_builder.append_null(); + continue; } - Some((start, stop, step)) => { - // Below, we utilize `usize` to represent steps. - // On 32-bit targets, the absolute value of `i64` may fail to fit into `usize`. 
- let step_abs = usize::try_from(step.unsigned_abs()).map_err(|_| { - not_impl_datafusion_err!("step {} can't fit into usize", step) - })?; - values.extend( - gen_range_iter(start, stop, step < 0, include_upper) - .step_by(step_abs), - ); - offsets.push(values.len() as i32); - valid.append_non_null(); + + let start = start_array.value(idx); + let stop = stop_array.value(idx); + let step = step_array.value(idx); + + let (months, days, _) = IntervalMonthDayNanoType::to_parts(step); + if months == 0 && days == 0 { + return exec_err!("Cannot generate date range less than 1 day."); + } + + let stop = if !self.include_upper_bound { + Date32Type::subtract_month_day_nano(stop, step) + } else { + stop + }; + + let neg = months < 0 || days < 0; + let mut new_date = start; + + let values = from_fn(|| { + if (neg && new_date < stop) || (!neg && new_date > stop) { + None + } else { + let current_date = new_date; + new_date = Date32Type::add_month_day_nano(new_date, step); + Some(Some(current_date)) + } + }); + + list_builder.append_value(values); + } + + let arr = Arc::new(list_builder.finish()); + + Ok(arr) + } + + fn gen_range_timestamp(&self, args: &[ArrayRef]) -> Result { + let [start, stop, step] = take_function_args(self.name(), args)?; + + // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz) + // TODO: remove these map_err once the signature is robust enough to guard against this + let start_arr = as_timestamp_nanosecond_array(start).map_err(|_e| { + DataFusionError::Internal(format!( + "Unexpected argument type for {} : {}", + self.name(), + start.data_type() + )) + })?; + let stop_arr = as_timestamp_nanosecond_array(stop).map_err(|_e| { + DataFusionError::Internal(format!( + "Unexpected argument type for {} : {}", + self.name(), + stop.data_type() + )) + })?; + let step_arr = as_interval_mdn_array(step)?; + let start_tz = parse_tz(&start_arr.timezone())?; + let stop_tz = parse_tz(&stop_arr.timezone())?; + + // values are timestamps + let values_builder = start_arr + .timezone() + .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| { + TimestampNanosecondBuilder::new().with_timezone(start_tz_str) + }); + let mut list_builder = ListBuilder::new(values_builder); + + for idx in 0..start_arr.len() { + if start_arr.is_null(idx) || stop_arr.is_null(idx) || step_arr.is_null(idx) { + list_builder.append_null(); + continue; } - // If any of the arguments is NULL, append a NULL value to the result. - None => { - offsets.push(values.len() as i32); - valid.append_null(); + + let start = start_arr.value(idx); + let stop = stop_arr.value(idx); + let step = step_arr.value(idx); + + let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step); + if months == 0 && days == 0 && ns == 0 { + return exec_err!("Interval argument to {} must not be 0", self.name()); } - }; + + let neg = TimestampNanosecondType::add_month_day_nano(start, step, start_tz) + .ok_or(exec_datafusion_err!( + "Cannot generate timestamp range where start + step overflows" + ))? 
+ .cmp(&start) + == Ordering::Less; + + let stop_dt = + as_datetime_with_timezone::(stop, stop_tz) + .ok_or(exec_datafusion_err!( + "Cannot generate timestamp for stop: {}: {:?}", + stop, + stop_tz + ))?; + + let mut current = start; + let mut current_dt = + as_datetime_with_timezone::(current, start_tz) + .ok_or(exec_datafusion_err!( + "Cannot generate timestamp for start: {}: {:?}", + current, + start_tz + ))?; + + let values = from_fn(|| { + let generate_series_should_end = self.include_upper_bound + && ((neg && current_dt < stop_dt) || (!neg && current_dt > stop_dt)); + let range_should_end = !self.include_upper_bound + && ((neg && current_dt <= stop_dt) + || (!neg && current_dt >= stop_dt)); + if generate_series_should_end || range_should_end { + return None; + } + + let prev_current = current; + + if let Some(ts) = + TimestampNanosecondType::add_month_day_nano(current, step, start_tz) + { + current = ts; + current_dt = as_datetime_with_timezone::( + current, start_tz, + )?; + + Some(Some(prev_current)) + } else { + // we failed to parse the timestamp here so terminate the series + None + } + }); + + list_builder.append_value(values); + } + + let arr = Arc::new(list_builder.finish()); + + Ok(arr) } - let arr = Arc::new(ListArray::try_new( - Arc::new(Field::new_list_field(Int64, true)), - OffsetBuffer::new(offsets.into()), - Arc::new(Int64Array::from(values)), - valid.finish(), - )?); - Ok(arr) } /// Get the (start, stop, step) args for the range and generate_series function. @@ -436,201 +529,7 @@ fn gen_range_iter( } } -fn gen_range_date(args: &[ArrayRef], include_upper_bound: bool) -> Result { - let [start, stop, step] = take_function_args("range", args)?; - - let (start_array, stop_array, step_array) = ( - Some(as_date32_array(start)?), - as_date32_array(stop)?, - Some(as_interval_mdn_array(step)?), - ); - - // values are date32s - let values_builder = Date32Builder::new(); - let mut list_builder = ListBuilder::new(values_builder); - - for idx in 0..stop_array.len() { - if stop_array.is_null(idx) { - list_builder.append_null(); - continue; - } - let mut stop = stop_array.value(idx); - - let start = if let Some(start_array_values) = start_array { - if start_array_values.is_null(idx) { - list_builder.append_null(); - continue; - } - start_array_values.value(idx) - } else { - list_builder.append_null(); - continue; - }; - - let step = if let Some(step) = step_array { - if step.is_null(idx) { - list_builder.append_null(); - continue; - } - step.value(idx) - } else { - list_builder.append_null(); - continue; - }; - - let (months, days, _) = IntervalMonthDayNanoType::to_parts(step); - - if months == 0 && days == 0 { - return exec_err!("Cannot generate date range less than 1 day."); - } - - let neg = months < 0 || days < 0; - if !include_upper_bound { - stop = Date32Type::subtract_month_day_nano(stop, step); - } - let mut new_date = start; - - let values = from_fn(|| { - if (neg && new_date < stop) || (!neg && new_date > stop) { - None - } else { - let current_date = new_date; - new_date = Date32Type::add_month_day_nano(new_date, step); - Some(Some(current_date)) - } - }); - - list_builder.append_value(values); - } - - let arr = Arc::new(list_builder.finish()); - - Ok(arr) -} - -fn gen_range_timestamp(args: &[ArrayRef], include_upper_bound: bool) -> Result { - let func_name = if include_upper_bound { - "GENERATE_SERIES" - } else { - "RANGE" - }; - let [start, stop, step] = take_function_args(func_name, args)?; - - // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz) - 
let (start_arr, start_tz_opt) = cast_timestamp_arg(start, include_upper_bound)?; - let (stop_arr, stop_tz_opt) = cast_timestamp_arg(stop, include_upper_bound)?; - let step_arr = as_interval_mdn_array(step)?; - let start_tz = parse_tz(start_tz_opt)?; - let stop_tz = parse_tz(stop_tz_opt)?; - - // values are timestamps - let values_builder = start_tz_opt - .clone() - .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| { - TimestampNanosecondBuilder::new().with_timezone(start_tz_str) - }); - let mut list_builder = ListBuilder::new(values_builder); - - for idx in 0..start_arr.len() { - if start_arr.is_null(idx) || stop_arr.is_null(idx) || step_arr.is_null(idx) { - list_builder.append_null(); - continue; - } - - let start = start_arr.value(idx); - let stop = stop_arr.value(idx); - let step = step_arr.value(idx); - - let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step); - if months == 0 && days == 0 && ns == 0 { - return exec_err!( - "Interval argument to {} must not be 0", - if include_upper_bound { - "GENERATE_SERIES" - } else { - "RANGE" - } - ); - } - - let neg = TSNT::add_month_day_nano(start, step, start_tz) - .ok_or(exec_datafusion_err!( - "Cannot generate timestamp range where start + step overflows" - ))? - .cmp(&start) - == Ordering::Less; - - let stop_dt = as_datetime_with_timezone::(stop, stop_tz).ok_or( - exec_datafusion_err!( - "Cannot generate timestamp for stop: {}: {:?}", - stop, - stop_tz - ), - )?; - - let mut current = start; - let mut current_dt = as_datetime_with_timezone::(current, start_tz).ok_or( - exec_datafusion_err!( - "Cannot generate timestamp for start: {}: {:?}", - current, - start_tz - ), - )?; - - let values = from_fn(|| { - if (include_upper_bound - && ((neg && current_dt < stop_dt) || (!neg && current_dt > stop_dt))) - || (!include_upper_bound - && ((neg && current_dt <= stop_dt) - || (!neg && current_dt >= stop_dt))) - { - return None; - } - - let prev_current = current; - - if let Some(ts) = TSNT::add_month_day_nano(current, step, start_tz) { - current = ts; - current_dt = as_datetime_with_timezone::(current, start_tz)?; - - Some(Some(prev_current)) - } else { - // we failed to parse the timestamp here so terminate the series - None - } - }); - - list_builder.append_value(values); - } - - let arr = Arc::new(list_builder.finish()); - - Ok(arr) -} - -fn cast_timestamp_arg( - arg: &ArrayRef, - include_upper: bool, -) -> Result<(&TimestampNanosecondArray, &Option>)> { - match arg.data_type() { - Timestamp(Nanosecond, tz_opt) => { - Ok((as_timestamp_nanosecond_array(arg)?, tz_opt)) - } - _ => { - internal_err!( - "Unexpected argument type for {} : {}", - if include_upper { - "GENERATE_SERIES" - } else { - "RANGE" - }, - arg.data_type() - ) - } - } -} - -fn parse_tz(tz: &Option>) -> Result { +fn parse_tz(tz: &Option<&str>) -> Result { let tz = tz.as_ref().map_or_else(|| "+00", |s| s); Tz::from_str(tz) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 29f0241c8862..144e3b757adf 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6054,7 +6054,7 @@ NULL NULL # array_has([], 1) -> 'false' (empty array should return false) # array_has(null, 1) -> 'null' (null array should return null) query ?T -SELECT column1, COALESCE(CAST(array_has(column1, column2) AS VARCHAR), 'null') +SELECT column1, COALESCE(CAST(array_has(column1, column2) AS VARCHAR), 'null') from array_has_table_empty; ---- [1, 3, 5] true @@ -6315,7 +6315,7 @@ true 
false false true false true false false NULL NULL false false false false NULL false -false false false NULL +false false false NULL query BBBB select array_has_all(make_array(1,2,3), []), @@ -7131,7 +7131,7 @@ select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, [2021-01-01T00:00:00-05:00, 2021-01-01T01:29:54.500-05:00, 2021-01-01T02:59:49-05:00, 2021-01-01T04:29:43.500-05:00, 2021-01-01T05:59:38-05:00] ## mixing types for timestamps is not supported -query error DataFusion error: Internal error: Unexpected argument type for GENERATE_SERIES : Date32 +query error DataFusion error: Internal error: Unexpected argument type for generate_series : Date32 select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, Some("-05:00"))'), DATE '2021-01-02', INTERVAL '1' HOUR); @@ -7239,7 +7239,7 @@ query error DataFusion error: Execution error: step can't be 0 for function gene select generate_series(1, 1, 0); # Test generate_series with zero step -query error DataFusion error: Execution error: Interval argument to GENERATE_SERIES must not be 0 +query error DataFusion error: Execution error: Interval argument to generate_series must not be 0 select generate_series(TIMESTAMP '2000-01-02', TIMESTAMP '2000-01-01', INTERVAL '0' MINUTE); # Test generate_series with big steps @@ -8209,7 +8209,7 @@ select array_reverse(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array [3, 2, 1] [1] query ???? -select array_reverse(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), +select array_reverse(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_reverse(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')), array_reverse(arrow_cast(make_array(1, NULL, 3), 'FixedSizeList(3, Int64)')), array_reverse(arrow_cast(make_array(NULL, NULL, NULL), 'FixedSizeList(3, Int64)')); diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index f6a49c2f1763..da1982acebe9 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -4213,7 +4213,8 @@ flatten(array) Similar to the range function, but it includes the upper bound. ```sql -generate_series(start, stop, step) +generate_series(stop) +generate_series(start, stop[, step]) ``` #### Arguments @@ -4433,7 +4434,8 @@ _Alias of [make_array](#make_array)._ Returns an Arrow array between start and stop with step. The range start..end contains all values with start <= x < end. It is empty if start >= end. Step cannot be 0. 
```sql -range(start, stop, step) +range(stop) +range(start, stop[, step]) ``` #### Arguments @@ -4453,11 +4455,11 @@ range(start, stop, step) +-----------------------------------+ > select range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH); -+--------------------------------------------------------------+ -| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ +| range(DATE '1992-09-01', DATE '1993-03-01', INTERVAL '1' MONTH) | ++--------------------------------------------------------------------------+ | [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] | -+--------------------------------------------------------------+ ++--------------------------------------------------------------------------+ ``` ### `string_to_array` From 8eed1fd46bdb9cf512794481d1a48657599284e6 Mon Sep 17 00:00:00 2001 From: Marc Brinkmann Date: Tue, 28 Oct 2025 03:20:54 +0100 Subject: [PATCH 024/157] Enforce unique names for `is_set` on `first_value` and `last_value` (#18303) ## Which issue does this PR close? - Closes #18302 ## Rationale for this change As described in the issue, this is a low-effort QoL fix for now. ## What changes are included in this PR? Uses the existing function for naming fields to replace the hardcoded `"is_set"` with a field-dependent name. Example output: ``` Field { name: "first_value(records_partitioned.trace_id)[first_value]", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {}, }, Field { name: "first_value(records_partitioned.trace_id)[first_value_is_set]", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {}, }, Field { name: "first_value(records_partitioned.value)[first_value]", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {}, }, Field { name: "first_value(records_partitioned.value)[first_value_is_set]", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {}, }, ``` ## Are these changes tested? No tests have been added, hopefully it should be covered by existing changes. ## Are there any user-facing changes? There should not be any, I assume `is_set` is never user visible. 
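As background for the diff below: the fix simply routes the boolean flag field through the same `format_state_name` helper that already names the other state fields, instead of hard-coding `"is_set"`. The following is a minimal, stand-alone sketch of the naming pattern — the helper's body here is inferred from the example output above rather than copied from DataFusion, so treat it as illustrative only.

```rust
// Illustrative sketch: the real `format_state_name` lives in the
// functions-aggregate crates; what matters is the "<name>[<state>]" shape.
fn format_state_name(name: &str, state_name: &str) -> String {
    format!("{name}[{state_name}]")
}

fn main() {
    let agg = "first_value(records_partitioned.trace_id)";
    // Previously every first_value/last_value accumulator emitted a field
    // literally named "is_set"; with this change the flag field is unique
    // per aggregate expression:
    assert_eq!(
        format_state_name(agg, "first_value_is_set"),
        "first_value(records_partitioned.trace_id)[first_value_is_set]"
    );
}
```

Making the flag name depend on the aggregate expression avoids duplicate field names when several `first_value`/`last_value` calls end up in the same partial-aggregation schema.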
--- datafusion/core/tests/dataframe/mod.rs | 4 ++-- .../functions-aggregate/src/first_last.rs | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 17d1695478a5..043f42b18c9f 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -6459,10 +6459,10 @@ async fn test_duplicate_state_fields_for_dfschema_construct() -> Result<()> { "ticker", "first_value(value)[first_value]", "timestamp@0", - "is_set", + "first_value(value)[first_value_is_set]", "last_value(value)[last_value]", "timestamp@0", - "is_set", + "last_value(value)[last_value_is_set]", ]; let binding = partial_agg.schema(); diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 28755427c732..b2a40ff50bd7 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -166,7 +166,14 @@ impl AggregateUDFImpl for FirstValue { ) .into()]; fields.extend(args.ordering_fields.iter().cloned()); - fields.push(Field::new("is_set", DataType::Boolean, true).into()); + fields.push( + Field::new( + format_state_name(args.name, "first_value_is_set"), + DataType::Boolean, + true, + ) + .into(), + ); Ok(fields) } @@ -1087,7 +1094,14 @@ impl AggregateUDFImpl for LastValue { ) .into()]; fields.extend(args.ordering_fields.iter().cloned()); - fields.push(Field::new("is_set", DataType::Boolean, true).into()); + fields.push( + Field::new( + format_state_name(args.name, "last_value_is_set"), + DataType::Boolean, + true, + ) + .into(), + ); Ok(fields) } From 556125f8734297aba72c56ead2ecdc699aba17e9 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Tue, 28 Oct 2025 13:21:52 +1100 Subject: [PATCH 025/157] fix: support float16 for `abs()` (#18304) ## Which issue does this PR close? N/A ## Rationale for this change Cover missing f16 type for `abs` ## What changes are included in this PR? Support `abs` on f16; also do some cleanup. ## Are these changes tested? Added SLT. ## Are there any user-facing changes? No. 
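One detail worth calling out before the diff: `f16` (from the `half` crate used by Arrow) has no inherent `abs` method the way `f32`/`f64` do, which is why the change pulls in `num-traits` and imports `Signed` — the existing `make_abs_function!` macro's per-element `.abs()` call then resolves through that trait. A small stand-alone sketch of the idea, assuming `half` is built with its `num-traits` feature:

```rust
// Illustrative sketch only; assumes the `half` and `num-traits` crates
// as dependencies (not part of the patch itself).
use half::f16;
use num_traits::Signed;

fn main() {
    let x = f16::from_f32(-1.5);
    // With `Signed` in scope, `.abs()` is available on f16, mirroring the
    // per-element call made by `make_abs_function!(Float16Array)`.
    assert_eq!(x.abs(), f16::from_f32(1.5));
}
```

The `return_type` simplification below follows the same idea: `abs` always returns its input type, so echoing `arg_types[0]` covers `Float16` (and the other supported numeric types) without enumerating each variant.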
--- Cargo.lock | 1 + Cargo.toml | 1 + datafusion/datasource-avro/Cargo.toml | 2 +- datafusion/functions/Cargo.toml | 1 + datafusion/functions/src/math/abs.rs | 36 ++------ datafusion/sqllogictest/test_files/math.slt | 93 ++++++++++++--------- 6 files changed, 62 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aaa75ecf3247..c6e28555769f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2265,6 +2265,7 @@ dependencies = [ "itertools 0.14.0", "log", "md-5", + "num-traits", "rand 0.9.2", "regex", "sha2", diff --git a/Cargo.toml b/Cargo.toml index 1cfb23bb183d..e48afb19ff73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -159,6 +159,7 @@ indexmap = "2.12.0" insta = { version = "1.43.2", features = ["glob", "filters"] } itertools = "0.14" log = "^0.4" +num-traits = { version = "0.2" } object_store = { version = "0.12.4", default-features = false } parking_lot = "0.12" parquet = { version = "57.0.0", default-features = false, features = [ diff --git a/datafusion/datasource-avro/Cargo.toml b/datafusion/datasource-avro/Cargo.toml index e013e8a3d093..6bab899e7f97 100644 --- a/datafusion/datasource-avro/Cargo.toml +++ b/datafusion/datasource-avro/Cargo.toml @@ -41,7 +41,7 @@ datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-session = { workspace = true } futures = { workspace = true } -num-traits = { version = "0.2" } +num-traits = { workspace = true } object_store = { workspace = true } [dev-dependencies] diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 90331fbccaf0..1dbeee7159fd 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -78,6 +78,7 @@ hex = { version = "0.4", optional = true } itertools = { workspace = true } log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } +num-traits = { workspace = true } rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { version = "^0.10.9", optional = true } diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs index 040f13c01449..b3dc2b2eb6f8 100644 --- a/datafusion/functions/src/math/abs.rs +++ b/datafusion/functions/src/math/abs.rs @@ -22,7 +22,8 @@ use std::sync::Arc; use arrow::array::{ ArrayRef, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, }; use arrow::datatypes::DataType; use arrow::error::ArrowError; @@ -34,6 +35,7 @@ use datafusion_expr::{ Volatility, }; use datafusion_macros::user_doc; +use num_traits::sign::Signed; type MathArrayFunction = fn(&ArrayRef) -> Result; @@ -81,6 +83,7 @@ macro_rules! 
make_decimal_abs_function { /// Return different implementations based on input datatype to reduce branches during execution fn create_abs_function(input_data_type: &DataType) -> Result { match input_data_type { + DataType::Float16 => Ok(make_abs_function!(Float16Array)), DataType::Float32 => Ok(make_abs_function!(Float32Array)), DataType::Float64 => Ok(make_abs_function!(Float64Array)), @@ -143,6 +146,7 @@ impl ScalarUDFImpl for AbsFunc { fn as_any(&self) -> &dyn Any { self } + fn name(&self) -> &str { "abs" } @@ -152,35 +156,7 @@ impl ScalarUDFImpl for AbsFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - match arg_types[0] { - DataType::Float32 => Ok(DataType::Float32), - DataType::Float64 => Ok(DataType::Float64), - DataType::Int8 => Ok(DataType::Int8), - DataType::Int16 => Ok(DataType::Int16), - DataType::Int32 => Ok(DataType::Int32), - DataType::Int64 => Ok(DataType::Int64), - DataType::Null => Ok(DataType::Null), - DataType::UInt8 => Ok(DataType::UInt8), - DataType::UInt16 => Ok(DataType::UInt16), - DataType::UInt32 => Ok(DataType::UInt32), - DataType::UInt64 => Ok(DataType::UInt64), - DataType::Decimal32(precision, scale) => { - Ok(DataType::Decimal32(precision, scale)) - } - DataType::Decimal64(precision, scale) => { - Ok(DataType::Decimal64(precision, scale)) - } - DataType::Decimal128(precision, scale) => { - Ok(DataType::Decimal128(precision, scale)) - } - DataType::Decimal256(precision, scale) => { - Ok(DataType::Decimal256(precision, scale)) - } - _ => not_impl_err!( - "Unsupported data type {} for function abs", - arg_types[0].to_string() - ), - } + Ok(arg_types[0].clone()) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index e206aa16b8a9..1cb68b85b2bc 100644 --- a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -139,16 +139,16 @@ select abs(arrow_cast('-1.2', 'Utf8')); statement ok CREATE TABLE test_nullable_integer( - c1 TINYINT, - c2 SMALLINT, - c3 INT, - c4 BIGINT, - c5 TINYINT UNSIGNED, - c6 SMALLINT UNSIGNED, - c7 INT UNSIGNED, - c8 BIGINT UNSIGNED, + c1 TINYINT, + c2 SMALLINT, + c3 INT, + c4 BIGINT, + c5 TINYINT UNSIGNED, + c6 SMALLINT UNSIGNED, + c7 INT UNSIGNED, + c8 BIGINT UNSIGNED, dataset TEXT - ) + ) AS VALUES (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'nulls'), (0, 0, 0, 0, 0, 0, 0, 0, 'zeros'), @@ -237,7 +237,7 @@ SELECT c8%0 FROM test_nullable_integer # abs: return type query TTTTTTTT rowsort -select +select arrow_typeof(abs(c1)), arrow_typeof(abs(c2)), arrow_typeof(abs(c3)), arrow_typeof(abs(c4)), arrow_typeof(abs(c5)), arrow_typeof(abs(c6)), arrow_typeof(abs(c7)), arrow_typeof(abs(c8)) from test_nullable_integer limit 1 @@ -285,13 +285,13 @@ drop table test_nullable_integer statement ok CREATE TABLE test_non_nullable_integer( - c1 TINYINT NOT NULL, - c2 SMALLINT NOT NULL, - c3 INT NOT NULL, - c4 BIGINT NOT NULL, - c5 TINYINT UNSIGNED NOT NULL, - c6 SMALLINT UNSIGNED NOT NULL, - c7 INT UNSIGNED NOT NULL, + c1 TINYINT NOT NULL, + c2 SMALLINT NOT NULL, + c3 INT NOT NULL, + c4 BIGINT NOT NULL, + c5 TINYINT UNSIGNED NOT NULL, + c6 SMALLINT UNSIGNED NOT NULL, + c7 INT UNSIGNED NOT NULL, c8 BIGINT UNSIGNED NOT NULL ); @@ -363,7 +363,7 @@ CREATE TABLE test_nullable_float( c2 double ) AS VALUES (-1.0, -1.0), - (1.0, 1.0), + (1.0, 1.0), (NULL, NULL), (0., 0.), ('NaN'::double, 'NaN'::double); @@ -412,7 +412,7 @@ Float32 Float64 # abs: floats query RR rowsort -SELECT abs(c1), 
abs(c2) from test_nullable_float +SELECT abs(c1), abs(c2) from test_nullable_float ---- 0 0 1 1 @@ -420,6 +420,17 @@ SELECT abs(c1), abs(c2) from test_nullable_float NULL NULL NaN NaN +# f16 +query TR rowsort +SELECT arrow_typeof(abs(arrow_cast(c1, 'Float16'))), abs(arrow_cast(c1, 'Float16')) +FROM test_nullable_float +---- +Float16 0 +Float16 1 +Float16 1 +Float16 NULL +Float16 NaN + statement ok drop table test_nullable_float @@ -428,7 +439,7 @@ statement ok CREATE TABLE test_non_nullable_float( c1 float NOT NULL, c2 double NOT NULL - ); + ); query I INSERT INTO test_non_nullable_float VALUES @@ -478,27 +489,27 @@ drop table test_non_nullable_float statement ok CREATE TABLE test_nullable_decimal( c1 DECIMAL(10, 2), /* Decimal128 */ - c2 DECIMAL(38, 10), /* Decimal128 with max precision */ + c2 DECIMAL(38, 10), /* Decimal128 with max precision */ c3 DECIMAL(40, 2), /* Decimal256 */ - c4 DECIMAL(76, 10) /* Decimal256 with max precision */ - ) AS VALUES - (0, 0, 0, 0), + c4 DECIMAL(76, 10) /* Decimal256 with max precision */ + ) AS VALUES + (0, 0, 0, 0), (NULL, NULL, NULL, NULL); query I INSERT into test_nullable_decimal values ( - -99999999.99, - '-9999999999999999999999999999.9999999999', - '-99999999999999999999999999999999999999.99', + -99999999.99, + '-9999999999999999999999999999.9999999999', + '-99999999999999999999999999999999999999.99', '-999999999999999999999999999999999999999999999999999999999999999999.9999999999' - ), + ), ( - 99999999.99, - '9999999999999999999999999999.9999999999', - '99999999999999999999999999999999999999.99', + 99999999.99, + '9999999999999999999999999999.9999999999', + '99999999999999999999999999999999999999.99', '999999999999999999999999999999999999999999999999999999999999999999.9999999999' - ) + ) ---- 2 @@ -533,9 +544,9 @@ SELECT c1%0 FROM test_nullable_decimal WHERE c1 IS NOT NULL; # abs: return type query TTTT -SELECT - arrow_typeof(abs(c1)), - arrow_typeof(abs(c2)), +SELECT + arrow_typeof(abs(c1)), + arrow_typeof(abs(c2)), arrow_typeof(abs(c3)), arrow_typeof(abs(c4)) FROM test_nullable_decimal limit 1 @@ -552,11 +563,11 @@ SELECT abs(c1), abs(c2), abs(c3), abs(c4) FROM test_nullable_decimal NULL NULL NULL NULL statement ok -drop table test_nullable_decimal +drop table test_nullable_decimal statement ok -CREATE TABLE test_non_nullable_decimal(c1 DECIMAL(9,2) NOT NULL); +CREATE TABLE test_non_nullable_decimal(c1 DECIMAL(9,2) NOT NULL); query I INSERT INTO test_non_nullable_decimal VALUES(1) @@ -569,13 +580,13 @@ SELECT c1*0 FROM test_non_nullable_decimal 0 query error DataFusion error: Arrow error: Divide by zero error -SELECT c1/0 FROM test_non_nullable_decimal +SELECT c1/0 FROM test_non_nullable_decimal query error DataFusion error: Arrow error: Divide by zero error -SELECT c1%0 FROM test_non_nullable_decimal +SELECT c1%0 FROM test_non_nullable_decimal statement ok -drop table test_non_nullable_decimal +drop table test_non_nullable_decimal statement ok CREATE TABLE signed_integers( @@ -615,7 +626,7 @@ NULL NULL NULL # scalar maxes and/or negative 1 query III -select +select gcd(9223372036854775807, -9223372036854775808), -- i64::MAX, i64::MIN gcd(9223372036854775807, -1), -- i64::MAX, -1 gcd(-9223372036854775808, -1); -- i64::MIN, -1 From a4da700aba2907542181591318425a1a803c9504 Mon Sep 17 00:00:00 2001 From: Artem Medvedev Date: Mon, 27 Oct 2025 21:24:19 -0500 Subject: [PATCH 026/157] chore(deps): update testcontainers to `0.25.2` and drop ignore of `RUSTSEC-2025-0111` (#18305) ## Which issue does this PR close? 
Follow up to #18288 ## Rationale for this change Updates `testcontainers` in order to avoid `RUSTSEC-2025-0111` ignore --- .github/workflows/audit.yml | 7 +- Cargo.lock | 204 +++++++++++++++++++++++++++++------- Cargo.toml | 4 +- 3 files changed, 171 insertions(+), 44 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 3685bb2f9a78..a77ca501976f 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -46,9 +46,4 @@ jobs: with: tool: cargo-audit - name: Run audit check - # RUSTSEC-2025-0111: tokio-tar is by testcontainers for orchestration - # of testing, so does not impact DataFusion's security - # See https://github.com/apache/datafusion/issues/18288 - # NOTE: can remove this once testcontainers releases a version that includes - # https://github.com/testcontainers/testcontainers-rs/pull/852 - run: cargo audit --ignore RUSTSEC-2025-0111 + run: cargo audit diff --git a/Cargo.lock b/Cargo.lock index c6e28555769f..120dc29db223 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -502,6 +502,22 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "astral-tokio-tar" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5" +dependencies = [ + "filetime", + "futures-core", + "libc", + "portable-atomic", + "rustc-hash", + "tokio", + "tokio-stream", + "xattr", +] + [[package]] name = "async-compression" version = "0.4.19" @@ -539,6 +555,28 @@ dependencies = [ "syn 2.0.108", ] +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.108", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -1077,13 +1115,17 @@ dependencies = [ [[package]] name = "bollard" -version = "0.18.1" +version = "0.19.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ccca1260af6a459d75994ad5acc1651bcabcbdbc41467cc9786519ab854c30" +checksum = "ec7646ee90964aa59e9f832a67182791396a19a5b1d76eb17599a8310a7e2e09" dependencies = [ + "async-stream", "base64 0.22.1", + "bitflags 2.9.4", + "bollard-buildkit-proto", "bollard-stubs", "bytes", + "chrono", "futures-core", "futures-util", "hex", @@ -1096,7 +1138,9 @@ dependencies = [ "hyper-util", "hyperlocal", "log", + "num", "pin-project-lite", + "rand 0.9.2", "rustls", "rustls-native-certs", "rustls-pemfile", @@ -1108,19 +1152,40 @@ dependencies = [ "serde_urlencoded", "thiserror", "tokio", + "tokio-stream", "tokio-util", + "tonic", "tower-service", "url", "winapi", ] +[[package]] +name = "bollard-buildkit-proto" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad" +dependencies = [ + "prost", + "prost-types", + "tonic", + "tonic-prost", + "ureq", +] + [[package]] name = "bollard-stubs" -version = "1.47.1-rc.27.3.1" +version = "1.49.1-rc.28.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f179cfbddb6e77a5472703d4b30436bff32929c0aa8a9008ecf23d1d3cdd0da" +checksum 
= "5731fe885755e92beff1950774068e0cae67ea6ec7587381536fca84f1779623" dependencies = [ + "base64 0.22.1", + "bollard-buildkit-proto", + "bytes", + "chrono", + "prost", "serde", + "serde_json", "serde_repr", "serde_with", ] @@ -3977,7 +4042,7 @@ checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ "bitflags 2.9.4", "libc", - "redox_syscall 0.5.17", + "redox_syscall", ] [[package]] @@ -4193,6 +4258,20 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -4228,6 +4307,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -4363,7 +4464,7 @@ checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.17", + "redox_syscall", "smallvec", "windows-targets 0.52.6", ] @@ -5111,15 +5212,6 @@ dependencies = [ "syn 2.0.108", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.5.17" @@ -5407,6 +5499,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", @@ -6172,13 +6265,13 @@ dependencies = [ [[package]] name = "testcontainers" -version = "0.24.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23bb7577dca13ad86a78e8271ef5d322f37229ec83b8d98da6d996c588a1ddb1" +checksum = "3f3ac71069f20ecfa60c396316c283fbf35e6833a53dff551a31b5458da05edc" dependencies = [ + "astral-tokio-tar", "async-trait", "bollard", - "bollard-stubs", "bytes", "docker_credential", "either", @@ -6194,16 +6287,16 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", - "tokio-tar", "tokio-util", + "ulid", "url", ] [[package]] name = "testcontainers-modules" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac95cde96549fc19c6bf19ef34cc42bd56e264c1cb97e700e21555be0ecf9e2" +checksum = "1966329d5bb3f89d33602d2db2da971fb839f9297dad16527abf4564e2ae0a6d" dependencies = [ "testcontainers", ] @@ -6407,21 +6500,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-tar" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75" -dependencies = [ - "filetime", - "futures-core", - 
"libc", - "redox_syscall 0.3.5", - "tokio", - "tokio-stream", - "xattr", -] - [[package]] name = "tokio-util" version = "0.7.16" @@ -6703,6 +6781,16 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "ulid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" +dependencies = [ + "rand 0.9.2", + "web-time", +] + [[package]] name = "unicode-bidi" version = "0.3.18" @@ -6772,6 +6860,35 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "3.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99ba1025f18a4a3fc3e9b48c868e9beb4f24f4b4b1a325bada26bd4119f46537" +dependencies = [ + "base64 0.22.1", + "log", + "percent-encoding", + "rustls", + "rustls-pemfile", + "rustls-pki-types", + "ureq-proto", + "utf-8", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b4531c118335662134346048ddb0e54cc86bd7e81866757873055f0e38f5d2" +dependencies = [ + "base64 0.22.1", + "http 1.3.1", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.7" @@ -6790,6 +6907,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -7001,6 +7124,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b130c0d2d49f8b6889abc456e795e82525204f27c42cf767cf0d7734e089b8" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whoami" version = "1.6.1" diff --git a/Cargo.toml b/Cargo.toml index e48afb19ff73..bf0f3fa0510e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -178,8 +178,8 @@ rstest = "0.25.0" serde_json = "1" sqlparser = { version = "0.59.0", default-features = false, features = ["std", "visitor"] } tempfile = "3" -testcontainers = { version = "0.24", features = ["default"] } -testcontainers-modules = { version = "0.12" } +testcontainers = { version = "0.25.2", features = ["default"] } +testcontainers-modules = { version = "0.13" } tokio = { version = "1.48", features = ["macros", "rt", "sync"] } url = "2.5.7" From 1f14fa34abb0a4b471d1d2b58cb28c3c06c22c6f Mon Sep 17 00:00:00 2001 From: Samuele Resca Date: Tue, 28 Oct 2025 06:36:39 +0000 Subject: [PATCH 027/157] Using `try_append_value` from arrow-rs 57.0.0 (#18313) ## Which issue does this PR close? Avoid panic described #17857 by using `try_append_value` ## Rationale for this change Avoid panic described in #17857. ## Are these changes tested? Code is already covered by tests ## Are there any user-facing changes? 
No --- .../physical-expr/src/expressions/binary/kernels.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index 36ecd1c81619..ff833c17cdcb 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -166,7 +166,7 @@ pub fn concat_elements_utf8view( buffer.clear(); write!(&mut buffer, "{left}{right}") .expect("writing into string buffer failed"); - result.append_value(&buffer); + result.try_append_value(&buffer)?; } else { // at least one of the values is null, so the output is also null result.append_null() @@ -260,13 +260,13 @@ pub(crate) fn regex_match_dyn_scalar( let result: Result = match left.data_type() { DataType::Utf8 => { regexp_is_match_flag_scalar!(left, right, StringArray, not_match, flag) - }, + } DataType::Utf8View => { regexp_is_match_flag_scalar!(left, right, StringViewArray, not_match, flag) } DataType::LargeUtf8 => { regexp_is_match_flag_scalar!(left, right, LargeStringArray, not_match, flag) - }, + } DataType::Dictionary(_, _) => { let values = left.as_any_dictionary().values(); @@ -288,7 +288,7 @@ pub(crate) fn regex_match_dyn_scalar( _ => unreachable!(), } ) - }, + } other => internal_err!( "Data type {} not supported for operation 'regex_match_dyn_scalar' on string array", other From 28fb15a5f6a76e4cee9f6d26d0a24eb7ab3fd940 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Tue, 28 Oct 2025 14:48:19 +0800 Subject: [PATCH 028/157] feat: Introduce `PruningMetrics` and use it in parquet file pruning metric (#18297) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? part of https://github.com/apache/datafusion/issues/18195 ## Rationale for this change Make pruning related metrics display nicer. 
Before: `metrics=[...files_ranges_matched_statistics=3, files_ranges_pruned_statistics=7...]` PR: `metrics=[...files_ranges_pruned_statistics=10 total → 3 matched...]` ### Demo with `datafusion-cli` ``` CREATE EXTERNAL TABLE IF NOT EXISTS lineitem STORED AS parquet LOCATION '/Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem'; set datafusion.explain.analyze_level = summary; explain analyze select * from lineitem where l_orderkey = 3000000; +-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | plan_type | plan | 
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Plan with Metrics | CoalesceBatchesExec: target_batch_size=8192, metrics=[output_rows=5, elapsed_compute=384.635µs, output_bytes=1092.0 B] | | | FilterExec: l_orderkey@0 = 3000000, metrics=[output_rows=5, elapsed_compute=1.303305ms, output_bytes=530.8 KB] | | | DataSourceExec: file_groups={14 groups: [[Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-0.parquet:0..11525426], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-0.parquet:11525426..20311205, Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:0..2739647], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:2739647..14265073], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:14265073..20193593, Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-2.parquet:0..5596906], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-2.parquet:5596906..17122332], ...]}, projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment], file_type=parquet, predicate=l_orderkey@0 = 3000000, pruning_predicate=l_orderkey_null_count@2 != row_count@3 AND l_orderkey_min@0 <= 3000000 AND 3000000 <= l_orderkey_max@1, required_guarantees=[l_orderkey in (3000000)], metrics=[output_rows=19813, elapsed_compute=14ns, output_bytes=5.7 MB, files_ranges_pruned_statistics=21 total → 3 matched, bytes_scanned=2147308, page_index_rows_matched=19813, page_index_rows_pruned=729088, row_groups_matched_bloom_filter=0, row_groups_matched_statistics=1, row_groups_pruned_bloom_filter=0, row_groups_pruned_statistics=0, metadata_load_time=1.167622ms] | | | | 
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 1 row(s) fetched. Elapsed 0.051 seconds. ``` ## What changes are included in this PR? 1. Introduce `PruningMetrics` metrics type 2. Update `files_ranges_pruned_metrics` with this new metric type. Note this is applicable to other 6 metrics for different row group/page level pruning in parquet scanner, they're not included here to keep this PR easier to review. ## Are these changes tested? UT ## Are there any user-facing changes? No --- datafusion/core/tests/parquet/mod.rs | 25 +++- datafusion/core/tests/sql/explain_analyze.rs | 31 +++++ datafusion/datasource-parquet/src/metrics.rs | 8 +- datafusion/datasource-parquet/src/opener.rs | 16 ++- .../physical-plan/src/metrics/builder.rs | 18 ++- datafusion/physical-plan/src/metrics/mod.rs | 5 +- datafusion/physical-plan/src/metrics/value.rs | 125 ++++++++++++++++-- 7 files changed, 205 insertions(+), 23 deletions(-) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index c44d14abd381..34a48cdae374 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -37,6 +37,7 @@ use datafusion::{ prelude::{ParquetReadOptions, SessionConfig, SessionContext}, }; use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder}; +use datafusion_physical_plan::metrics::MetricValue; use parquet::arrow::ArrowWriter; use parquet::file::properties::{EnabledStatistics, WriterProperties}; use std::sync::Arc; @@ -155,8 +156,30 @@ impl TestOutput { self.metric_value("row_groups_pruned_statistics") } + /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, + /// for testing purpose, here it only aggregate the `pruned` count. 
fn files_ranges_pruned_statistics(&self) -> Option { - self.metric_value("files_ranges_pruned_statistics") + let mut total_pruned = 0; + let mut found = false; + + for metric in self.parquet_metrics.iter() { + let metric = metric.as_ref(); + if metric.value().name() == "files_ranges_pruned_statistics" { + if let MetricValue::PruningMetrics { + pruning_metrics, .. + } = metric.value() + { + total_pruned += pruning_metrics.pruned(); + found = true; + } + } + } + + if found { + Some(total_pruned) + } else { + None + } } /// The number of row_groups matched by bloom filter or statistics diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 43f79ead0257..a7cc30a9484c 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -257,6 +257,37 @@ async fn explain_analyze_level_datasource_parquet() { } } +#[tokio::test] +async fn explain_analyze_parquet_pruning_metrics() { + let table_name = "tpch_lineitem_small"; + let parquet_path = "tests/data/tpch_lineitem_small.parquet"; + let ctx = SessionContext::new(); + ctx.register_parquet(table_name, parquet_path, ParquetReadOptions::default()) + .await + .expect("register parquet table for explain analyze test"); + + // Test scenario: + // This table's l_orderkey has range [1, 7] + // So the following query can't prune the file: + // select * from tpch_lineitem_small where l_orderkey = 5; + // If change filter to `l_orderkey=10`, the whole file can be pruned using stat. + for (l_orderkey, expected_pruning_metrics) in + [(5, "1 total → 1 matched"), (10, "1 total → 0 matched")] + { + let sql = format!( + "explain analyze select * from {table_name} where l_orderkey = {l_orderkey};" + ); + + let plan = + collect_plan_with_context(&sql, &ctx, ExplainAnalyzeLevel::Summary).await; + + let expected_metrics = + format!("files_ranges_pruned_statistics={expected_pruning_metrics}"); + + assert_metrics!(&plan, "DataSourceExec", &expected_metrics); + } +} + #[tokio::test] async fn csv_explain_plans() { // This test verify the look of each plan in its full cycle plan creation diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 5f17fbb4b9ee..9d86a3ae9f2d 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -16,7 +16,7 @@ // under the License. use datafusion_physical_plan::metrics::{ - Count, ExecutionPlanMetricsSet, MetricBuilder, MetricType, Time, + Count, ExecutionPlanMetricsSet, MetricBuilder, MetricType, PruningMetrics, Time, }; /// Stores metrics about the parquet execution for a particular parquet file. @@ -27,7 +27,7 @@ use datafusion_physical_plan::metrics::{ /// [`ParquetFileReaderFactory`]: super::ParquetFileReaderFactory #[derive(Debug, Clone)] pub struct ParquetFileMetrics { - /// Number of file **ranges** pruned by partition or file level statistics. + /// Number of file **ranges** pruned or matched by partition or file level statistics. /// Pruning of files often happens at planning time but may happen at execution time /// if dynamic filters (e.g. from a join) result in additional pruning. /// @@ -41,7 +41,7 @@ pub struct ParquetFileMetrics { /// pushdown optimization may fill up the TopK heap when reading the first part of a file, /// then skip the second part if file statistics indicate it cannot contain rows /// that would be in the TopK. 
- pub files_ranges_pruned_statistics: Count, + pub files_ranges_pruned_statistics: PruningMetrics, /// Number of times the predicate could not be evaluated pub predicate_evaluation_errors: Count, /// Number of row groups whose bloom filters were checked and matched (not pruned) @@ -132,7 +132,7 @@ impl ParquetFileMetrics { let files_ranges_pruned_statistics = MetricBuilder::new(metrics) .with_type(MetricType::SUMMARY) - .counter("files_ranges_pruned_statistics", partition); + .pruning_metrics("files_ranges_pruned_statistics", partition); // ----------------------- // 'dev' level metrics diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index af7a537ca6f4..1c9b9feb9f50 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -40,7 +40,9 @@ use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::{ is_dynamic_physical_expr, PhysicalExpr, }; -use datafusion_physical_plan::metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder}; +use datafusion_physical_plan::metrics::{ + Count, ExecutionPlanMetricsSet, MetricBuilder, PruningMetrics, +}; use datafusion_pruning::{build_pruning_predicate, FilePruner, PruningPredicate}; #[cfg(feature = "parquet_encryption")] @@ -195,11 +197,13 @@ impl FileOpener for ParquetOpener { if let Some(file_pruner) = &mut file_pruner { if file_pruner.should_prune()? { // Return an empty stream immediately to skip the work of setting up the actual stream - file_metrics.files_ranges_pruned_statistics.add(1); + file_metrics.files_ranges_pruned_statistics.add_pruned(1); return Ok(futures::stream::empty().boxed()); } } + file_metrics.files_ranges_pruned_statistics.add_matched(1); + // Don't load the page index yet. Since it is not stored inline in // the footer, loading the page index if it is not needed will do // unnecessary I/O. We decide later if it is needed to evaluate the @@ -480,7 +484,7 @@ struct EarlyStoppingStream { /// None done: bool, file_pruner: FilePruner, - files_ranges_pruned_statistics: Count, + files_ranges_pruned_statistics: PruningMetrics, /// The inner stream inner: S, } @@ -489,7 +493,7 @@ impl EarlyStoppingStream { pub fn new( stream: S, file_pruner: FilePruner, - files_ranges_pruned_statistics: Count, + files_ranges_pruned_statistics: PruningMetrics, ) -> Self { Self { done: false, @@ -509,7 +513,9 @@ where // Since dynamic filters may have been updated, see if we can stop // reading this stream entirely. if self.file_pruner.should_prune()? 
{ - self.files_ranges_pruned_statistics.add(1); + self.files_ranges_pruned_statistics.add_pruned(1); + // Previously this file range has been counted as matched + self.files_ranges_pruned_statistics.subtract_matched(1); self.done = true; Ok(None) } else { diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs index 88ec1a3f67d1..bf59dccf6625 100644 --- a/datafusion/physical-plan/src/metrics/builder.rs +++ b/datafusion/physical-plan/src/metrics/builder.rs @@ -19,7 +19,7 @@ use std::{borrow::Cow, sync::Arc}; -use crate::metrics::MetricType; +use crate::metrics::{value::PruningMetrics, MetricType}; use super::{ Count, ExecutionPlanMetricsSet, Gauge, Label, Metric, MetricValue, Time, Timestamp, @@ -250,4 +250,20 @@ impl<'a> MetricBuilder<'a> { .build(MetricValue::EndTimestamp(timestamp.clone())); timestamp } + + /// Consumes self and creates a new `PruningMetrics` + pub fn pruning_metrics( + self, + name: impl Into>, + partition: usize, + ) -> PruningMetrics { + let pruning_metrics = PruningMetrics::new(); + self.with_partition(partition) + .build(MetricValue::PruningMetrics { + name: name.into(), + // inner values will be `Arc::clone()` + pruning_metrics: pruning_metrics.clone(), + }); + pruning_metrics + } } diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs index 02aad6eb60ac..e66db8f0c911 100644 --- a/datafusion/physical-plan/src/metrics/mod.rs +++ b/datafusion/physical-plan/src/metrics/mod.rs @@ -35,7 +35,9 @@ use datafusion_common::HashMap; pub use baseline::{BaselineMetrics, RecordOutput, SpillMetrics, SplitMetrics}; pub use builder::MetricBuilder; pub use custom::CustomMetricValue; -pub use value::{Count, Gauge, MetricValue, ScopedTimerGuard, Time, Timestamp}; +pub use value::{ + Count, Gauge, MetricValue, PruningMetrics, ScopedTimerGuard, Time, Timestamp, +}; /// Something that tracks a value of interest (metric) of a DataFusion /// [`ExecutionPlan`] execution. @@ -302,6 +304,7 @@ impl MetricsSet { MetricValue::Gauge { name, .. } => name == metric_name, MetricValue::StartTimestamp(_) => false, MetricValue::EndTimestamp(_) => false, + MetricValue::PruningMetrics { .. } => false, MetricValue::Custom { .. 
} => false, }) } diff --git a/datafusion/physical-plan/src/metrics/value.rs b/datafusion/physical-plan/src/metrics/value.rs index fc947935503c..3b8aa7a2bd34 100644 --- a/datafusion/physical-plan/src/metrics/value.rs +++ b/datafusion/physical-plan/src/metrics/value.rs @@ -362,6 +362,74 @@ impl Drop for ScopedTimerGuard<'_> { } } +/// Counters tracking pruning metrics +/// +/// For example, a file scanner initially is planned to scan 10 files, but skipped +/// 8 of them using statistics, the pruning metrics would look like: 10 total -> 2 matched +/// +/// Note `clone`ing update the same underlying metrics +#[derive(Debug, Clone)] +pub struct PruningMetrics { + pruned: Arc, + matched: Arc, +} + +impl Display for PruningMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let matched = self.matched.load(Ordering::Relaxed); + let total = self.pruned.load(Ordering::Relaxed) + matched; + + write!(f, "{total} total → {matched} matched") + } +} + +impl Default for PruningMetrics { + fn default() -> Self { + Self::new() + } +} + +impl PruningMetrics { + /// create a new PruningMetrics + pub fn new() -> Self { + Self { + pruned: Arc::new(AtomicUsize::new(0)), + matched: Arc::new(AtomicUsize::new(0)), + } + } + + /// Add `n` to the metric's pruned value + pub fn add_pruned(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.pruned.fetch_add(n, Ordering::Relaxed); + } + + /// Add `n` to the metric's matched value + pub fn add_matched(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.matched.fetch_add(n, Ordering::Relaxed); + } + + /// Subtract `n` to the metric's matched value. + pub fn subtract_matched(&self, n: usize) { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + self.matched.fetch_sub(n, Ordering::Relaxed); + } + + /// Number of items pruned + pub fn pruned(&self) -> usize { + self.pruned.load(Ordering::Relaxed) + } + + /// Number of items matched (not pruned) + pub fn matched(&self) -> usize { + self.matched.load(Ordering::Relaxed) + } +} + /// Possible values for a [super::Metric]. /// /// Among other differences, the metric types have different ways to @@ -426,6 +494,11 @@ pub enum MetricValue { StartTimestamp(Timestamp), /// The time at which execution ended EndTimestamp(Timestamp), + /// Metrics related to scan pruning + PruningMetrics { + name: Cow<'static, str>, + pruning_metrics: PruningMetrics, + }, Custom { /// The provided name of this metric name: Cow<'static, str>, @@ -519,11 +592,13 @@ impl MetricValue { Self::Time { name, .. } => name.borrow(), Self::StartTimestamp(_) => "start_timestamp", Self::EndTimestamp(_) => "end_timestamp", + Self::PruningMetrics { name, .. } => name.borrow(), Self::Custom { name, .. } => name.borrow(), } } - /// Return the value of the metric as a usize value + /// Return the value of the metric as a usize value, used to aggregate metric + /// value across partitions. pub fn as_usize(&self) -> usize { match self { Self::OutputRows(count) => count.value(), @@ -546,6 +621,10 @@ impl MetricValue { .and_then(|ts| ts.timestamp_nanos_opt()) .map(|nanos| nanos as usize) .unwrap_or(0), + // This function is a utility for aggregating metrics, for complex metric + // like `PruningMetrics`, this function is not supposed to get called. 
+ // Metrics aggregation for them are implemented inside `MetricsSet` directly. + Self::PruningMetrics { .. } => 0, Self::Custom { value, .. } => value.as_usize(), } } @@ -575,6 +654,10 @@ impl MetricValue { }, Self::StartTimestamp(_) => Self::StartTimestamp(Timestamp::new()), Self::EndTimestamp(_) => Self::EndTimestamp(Timestamp::new()), + Self::PruningMetrics { name, .. } => Self::PruningMetrics { + name: name.clone(), + pruning_metrics: PruningMetrics::new(), + }, Self::Custom { name, value } => Self::Custom { name: name.clone(), value: value.new_empty(), @@ -626,6 +709,20 @@ impl MetricValue { (Self::EndTimestamp(timestamp), Self::EndTimestamp(other_timestamp)) => { timestamp.update_to_max(other_timestamp); } + ( + Self::PruningMetrics { + pruning_metrics, .. + }, + Self::PruningMetrics { + pruning_metrics: other_pruning_metrics, + .. + }, + ) => { + let pruned = other_pruning_metrics.pruned.load(Ordering::Relaxed); + let matched = other_pruning_metrics.matched.load(Ordering::Relaxed); + pruning_metrics.add_pruned(pruned); + pruning_metrics.add_matched(matched); + } ( Self::Custom { value, .. }, Self::Custom { @@ -652,16 +749,17 @@ impl MetricValue { Self::ElapsedCompute(_) => 1, Self::OutputBytes(_) => 2, // Other metrics - Self::SpillCount(_) => 3, - Self::SpilledBytes(_) => 4, - Self::SpilledRows(_) => 5, - Self::CurrentMemoryUsage(_) => 6, - Self::Count { .. } => 7, - Self::Gauge { .. } => 8, - Self::Time { .. } => 9, - Self::StartTimestamp(_) => 10, // show timestamps last - Self::EndTimestamp(_) => 11, - Self::Custom { .. } => 12, + Self::PruningMetrics { .. } => 3, + Self::SpillCount(_) => 4, + Self::SpilledBytes(_) => 5, + Self::SpilledRows(_) => 6, + Self::CurrentMemoryUsage(_) => 7, + Self::Count { .. } => 8, + Self::Gauge { .. } => 9, + Self::Time { .. } => 10, + Self::StartTimestamp(_) => 11, // show timestamps last + Self::EndTimestamp(_) => 12, + Self::Custom { .. } => 13, } } @@ -700,6 +798,11 @@ impl Display for MetricValue { Self::StartTimestamp(timestamp) | Self::EndTimestamp(timestamp) => { write!(f, "{timestamp}") } + Self::PruningMetrics { + pruning_metrics, .. 
+ } => { + write!(f, "{pruning_metrics}") + } Self::Custom { name, value } => { write!(f, "name:{name} {value}") } From b2db7abed0b4cea67b0376909daf8216b407663a Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Tue, 28 Oct 2025 17:56:39 +1100 Subject: [PATCH 029/157] minor: doc fixes for timestamp output format (#18315) Followup some doc fixes missed in #17888 --- datafusion/functions/src/datetime/to_local_time.rs | 10 +++++----- datafusion/functions/src/datetime/to_timestamp.rs | 2 +- docs/source/user-guide/sql/scalar_functions.md | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datafusion/functions/src/datetime/to_local_time.rs b/datafusion/functions/src/datetime/to_local_time.rs index ccdb45c9b05f..82e862c2d1bc 100644 --- a/datafusion/functions/src/datetime/to_local_time.rs +++ b/datafusion/functions/src/datetime/to_local_time.rs @@ -67,11 +67,11 @@ use datafusion_macros::user_doc; FROM ( SELECT '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' AS time ); -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| time | type | to_local_time | to_local_time_type | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| 2024-04-01T00:00:20+02:00 | Timestamp(Nanosecond, Some("Europe/Brussels")) | 2024-04-01T00:00:20 | Timestamp(Nanosecond, None) | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ ++---------------------------+----------------------------------+---------------------+--------------------+ +| time | type | to_local_time | to_local_time_type | ++---------------------------+----------------------------------+---------------------+--------------------+ +| 2024-04-01T00:00:20+02:00 | Timestamp(ns, "Europe/Brussels") | 2024-04-01T00:00:20 | Timestamp(ns) | ++---------------------------+----------------------------------+---------------------+--------------------+ # combine `to_local_time()` with `date_bin()` to bin on boundaries in the timezone rather # than UTC boundaries diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index dcd52aa07be3..0a0700097770 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ -38,7 +38,7 @@ use datafusion_macros::user_doc; description = r#" Converts a value to a timestamp (`YYYY-MM-DDT00:00:00Z`). Supports strings, integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats] are provided. Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp. -Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. +Note: `to_timestamp` returns `Timestamp(ns)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. 
"#, syntax_example = "to_timestamp(expression[, ..., format_n])", sql_example = r#"```sql diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index da1982acebe9..77ef831eeb0a 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2780,11 +2780,11 @@ to_local_time(expression) FROM ( SELECT '2024-04-01T00:00:20Z'::timestamp AT TIME ZONE 'Europe/Brussels' AS time ); -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| time | type | to_local_time | to_local_time_type | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ -| 2024-04-01T00:00:20+02:00 | Timestamp(Nanosecond, Some("Europe/Brussels")) | 2024-04-01T00:00:20 | Timestamp(Nanosecond, None) | -+---------------------------+------------------------------------------------+---------------------+-----------------------------+ ++---------------------------+----------------------------------+---------------------+--------------------+ +| time | type | to_local_time | to_local_time_type | ++---------------------------+----------------------------------+---------------------+--------------------+ +| 2024-04-01T00:00:20+02:00 | Timestamp(ns, "Europe/Brussels") | 2024-04-01T00:00:20 | Timestamp(ns) | ++---------------------------+----------------------------------+---------------------+--------------------+ # combine `to_local_time()` with `date_bin()` to bin on boundaries in the timezone rather # than UTC boundaries @@ -2808,7 +2808,7 @@ FROM ( Converts a value to a timestamp (`YYYY-MM-DDT00:00:00Z`). Supports strings, integer, unsigned integer, and double types as input. Strings are parsed as RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono formats] are provided. Integers, unsigned integers, and doubles are interpreted as seconds since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp. -Note: `to_timestamp` returns `Timestamp(Nanosecond)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. +Note: `to_timestamp` returns `Timestamp(ns)`. The supported range for integer input is between `-9223372037` and `9223372036`. Supported range for string input is between `1677-09-21T00:12:44.0` and `2262-04-11T23:47:16.0`. Please use `to_timestamp_seconds` for the input outside of supported bounds. ```sql to_timestamp(expression[, ..., format_n]) From 1e4d25d2cd70143c2993742859a5eb09af2b3532 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Tue, 28 Oct 2025 16:00:20 +0800 Subject: [PATCH 030/157] minor: Add documentation to function `concat_elements_utf8view` (#18316) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #. ## Rationale for this change Noticed this function when reviewing https://github.com/apache/datafusion/pull/18313. I think it’s a good opportunity to add more documentation. ## What changes are included in this PR? ## Are these changes tested? ## Are there any user-facing changes? 
--- datafusion/physical-expr/src/expressions/binary/kernels.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index ff833c17cdcb..6c96975ed644 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -141,6 +141,12 @@ create_left_integral_dyn_scalar_kernel!( bitwise_shift_left_scalar ); +/// Concatenates two `StringViewArray`s element-wise. +/// If either element is `Null`, the result element is also `Null`. +/// +/// # Errors +/// - Returns an error if the input arrays have different lengths. +/// - Returns an error if any concatenated string exceeds `u32::MAX` (≈4 GB) in length. pub fn concat_elements_utf8view( left: &StringViewArray, right: &StringViewArray, From 3cdcec39339763f927e08f07c0b67cde0c76e7a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 20:07:10 +1100 Subject: [PATCH 031/157] chore(deps): bump taiki-e/install-action from 2.62.38 to 2.62.40 (#18318) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.62.38 to 2.62.40.
Release notes

Sourced from taiki-e/install-action's releases.

2.62.40

  • Update wasm-bindgen@latest to 0.2.105.

2.62.39

  • Update vacuum@latest to 0.19.1.

  • Update cargo-shear@latest to 1.6.1.

  • Update cargo-binstall@latest to 1.15.9.

  • Update mise@latest to 2025.10.18.

Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index a77ca501976f..40d4d4cfa380 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@c5b1b6f479c32f356cc6f4ba672a47f63853b13b # v2.62.38 + uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4b61a04bfb14..7019de0b7507 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -425,7 +425,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@c5b1b6f479c32f356cc6f4ba672a47f63853b13b # v2.62.38 + uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 with: tool: wasm-pack - name: Run tests with headless mode @@ -752,7 +752,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@c5b1b6f479c32f356cc6f4ba672a47f63853b13b # v2.62.38 + uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 with: tool: cargo-msrv From 66fc1f9109991b56d4955aabb41ef31f6f2fe03a Mon Sep 17 00:00:00 2001 From: bubulalabu Date: Tue, 28 Oct 2025 10:10:41 +0100 Subject: [PATCH 032/157] Add PostgreSQL-style named arguments support for scalar functions (#18019) ## Which issue does this PR close? Addresses one portion of #17379. ## Rationale for this change PostgreSQL supports named arguments for function calls using the syntax `function_name(param => value)`, which improves code readability and allows arguments to be specified in any order. DataFusion should support this syntax to enhance the user experience, especially for functions with many optional parameters. ## What changes are included in this PR? This PR implements PostgreSQL-style named arguments for scalar functions. 
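A compact sketch of how the two new pieces added in this PR compose: `Signature::with_parameter_names` declares parameter names, and `resolve_function_arguments` (in `datafusion_expr::arguments`) reorders a mixed positional/named call into positional order. The function name `my_func` and the literal values are illustrative only:

```rust
use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_expr::arguments::resolve_function_arguments;
use datafusion_expr::{lit, Signature, Volatility};

fn main() -> Result<()> {
    // Declare a hypothetical my_func(a, b, c) with named parameters
    let signature = Signature::exact(
        vec![DataType::Int64, DataType::Utf8, DataType::Float64],
        Volatility::Immutable,
    )
    .with_parameter_names(vec!["a".to_string(), "b".to_string(), "c".to_string()])?;
    let param_names = signature.parameter_names.clone().unwrap();

    // Corresponds to the SQL call: my_func(1, c => 3.0, b => 'hello')
    let args = vec![lit(1i64), lit(3.0), lit("hello")];
    let arg_names = vec![None, Some("c".to_string()), Some("b".to_string())];

    // Arguments are reordered into positional order [a, b, c]
    let resolved = resolve_function_arguments(&param_names, args, arg_names)?;
    assert_eq!(resolved, vec![lit(1i64), lit("hello"), lit(3.0)]);
    Ok(())
}
```

Whether named notation is available at all depends on the signature having a fixed arity, as described in the limitations listed below.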
**Features:** - Parse named arguments from SQL (param => value syntax) - Resolve named arguments to positional order before execution - Support mixed positional and named arguments - Store parameter names in function signatures - Show parameter names in error messages **Limitations:** - Named arguments only work for functions with known arity (fixed number of parameters) - Variadic functions (like `concat`) cannot use named arguments as they accept variable numbers of arguments - Supported signature types: `Exact`, `Uniform`, `Any`, `Coercible`, `Comparable`, `Numeric`, `String`, `Nullary`, `ArraySignature`, `UserDefined`, and `OneOf` (combinations of these) - Not supported: `Variadic`, `VariadicAny` **Implementation:** - Added argument resolution logic with validation - Extended Signature with parameter_names field - Updated SQL parser to handle named argument syntax - Integrated into physical planning phase - Added comprehensive tests and documentation **Example usage:** ```sql -- All named arguments SELECT substr(str => 'hello world', start_pos => 7, length => 5); -- Mixed positional and named arguments SELECT substr('hello world', start_pos => 7, length => 5); -- Named arguments in any order SELECT substr(length => 5, str => 'hello world', start_pos => 7); ``` **Improved error messages:** Before this PR, error messages showed generic types: ``` Candidate functions: substr(Any, Any) substr(Any, Any, Any) ``` After this PR, error messages show parameter names: ``` Candidate functions: substr(str, start_pos) substr(str, start_pos, length) ``` Example error output: ``` datafusion % target/debug/datafusion-cli DataFusion CLI v50.1.0 > SELECT substr(str => 'hello world'); Error during planning: Execution error: Function 'substr' user-defined coercion failed with "Error during planning: The substr function requires 2 or 3 arguments, but got 1.". No function matches the given name and argument types 'substr(Utf8)'. You might need to add explicit type casts. Candidate functions: substr(str, start_pos, length) ``` Note: The function shows all parameters including optional ones for UserDefined signatures. The error message "requires 2 or 3 arguments" indicates that `length` is optional. ## Are these changes tested? Yes, comprehensive tests are included: 1. **Unit tests** (18 tests total): - Argument validation and reordering logic (8 tests in `udf.rs`) - Error message formatting with parameter names (2 tests in `utils.rs`) - TypeSignature parameter name support for all fixed-arity variants including ArraySignature (10 tests in `signature.rs`) 2. **Integration tests** (`named_arguments.slt`): - Positional arguments (baseline) - Named arguments in order - Named arguments out of order - Mixed positional and named arguments - Optional parameters - Function aliases - Error cases (positional after named, unknown parameter, duplicate parameter) - Error message format verification All tests pass successfully. ## Are there any user-facing changes? **Yes**, this PR adds new user-facing functionality: 1. **New SQL syntax**: Users can now call functions with named arguments using `param => value` syntax (only for functions with fixed arity) 2. **Improved error messages**: Signature mismatch errors now display parameter names instead of generic types 3. 
**UDF API**: Function authors can add parameter names to their functions using: ```rust signature: Signature::uniform(2, vec![DataType::Float64], Volatility::Immutable) .with_parameter_names(vec!["base".to_string(), "exponent".to_string()]) .expect("valid parameter names") ``` **Potential breaking change** (very unlikely): Added new public field `parameter_names: Option>` to `Signature` struct. This is technically a breaking change if code constructs `Signature` using struct literal syntax. However, this is extremely unlikely in practice because: - `Signature` is almost always constructed using builder methods (`Signature::exact()`, `Signature::uniform()`, etc.) - The new field defaults to `None`, maintaining existing behavior - Existing code using builder methods continues to work without modification **No other breaking changes**: The feature is purely additive - existing SQL queries and UDF implementations work without modification. --- datafusion/expr-common/src/signature.rs | 756 +++++++++++++++++- datafusion/expr/src/arguments.rs | 285 +++++++ datafusion/expr/src/lib.rs | 1 + datafusion/expr/src/utils.rs | 51 +- datafusion/functions-nested/src/replace.rs | 3 + datafusion/functions/src/unicode/substr.rs | 8 +- datafusion/sql/src/expr/function.rs | 99 ++- .../src/engines/postgres_engine/mod.rs | 4 +- .../test_files/named_arguments.slt | 139 ++++ .../functions/adding-udfs.md | 113 +++ 10 files changed, 1440 insertions(+), 19 deletions(-) create mode 100644 datafusion/expr/src/arguments.rs create mode 100644 datafusion/sqllogictest/test_files/named_arguments.slt diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 5fd4518e2e57..38eef077c5af 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -22,9 +22,9 @@ use std::hash::Hash; use crate::type_coercion::aggregates::NUMERICS; use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; -use datafusion_common::internal_err; use datafusion_common::types::{LogicalType, LogicalTypeRef, NativeType}; use datafusion_common::utils::ListCoercion; +use datafusion_common::{internal_err, plan_err, Result}; use indexmap::IndexSet; use itertools::Itertools; @@ -84,6 +84,15 @@ pub enum Volatility { Volatile, } +/// Represents the arity (number of arguments) of a function signature +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Arity { + /// Fixed number of arguments + Fixed(usize), + /// Variable number of arguments (e.g., Variadic, VariadicAny, UserDefined) + Variable, +} + /// The types of arguments for which a function has implementations. /// /// [`TypeSignature`] **DOES NOT** define the types that a user query could call the @@ -245,6 +254,69 @@ impl TypeSignature { pub fn is_one_of(&self) -> bool { matches!(self, TypeSignature::OneOf(_)) } + + /// Returns the arity (expected number of arguments) for this type signature. + /// + /// Returns `Arity::Fixed(n)` for signatures with a specific argument count, + /// or `Arity::Variable` for variable-arity signatures like `Variadic`, `VariadicAny`, `UserDefined`. 
+ /// + /// # Examples + /// + /// ``` + /// # use datafusion_expr_common::signature::{TypeSignature, Arity}; + /// # use arrow::datatypes::DataType; + /// // Exact signature has fixed arity + /// let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + /// assert_eq!(sig.arity(), Arity::Fixed(2)); + /// + /// // Variadic signature has variable arity + /// let sig = TypeSignature::VariadicAny; + /// assert_eq!(sig.arity(), Arity::Variable); + /// ``` + pub fn arity(&self) -> Arity { + match self { + TypeSignature::Exact(types) => Arity::Fixed(types.len()), + TypeSignature::Uniform(count, _) => Arity::Fixed(*count), + TypeSignature::Numeric(count) => Arity::Fixed(*count), + TypeSignature::String(count) => Arity::Fixed(*count), + TypeSignature::Comparable(count) => Arity::Fixed(*count), + TypeSignature::Any(count) => Arity::Fixed(*count), + TypeSignature::Coercible(types) => Arity::Fixed(types.len()), + TypeSignature::Nullary => Arity::Fixed(0), + TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments, + .. + }) => Arity::Fixed(arguments.len()), + TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray) => { + Arity::Fixed(1) + } + TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray) => { + Arity::Fixed(1) + } + TypeSignature::OneOf(variants) => { + // If any variant is Variable, the whole OneOf is Variable + let has_variable = variants.iter().any(|v| v.arity() == Arity::Variable); + if has_variable { + return Arity::Variable; + } + // Otherwise, get max arity from all fixed arity variants + let max_arity = variants + .iter() + .filter_map(|v| match v.arity() { + Arity::Fixed(n) => Some(n), + Arity::Variable => None, + }) + .max(); + match max_arity { + Some(n) => Arity::Fixed(n), + None => Arity::Variable, + } + } + TypeSignature::Variadic(_) + | TypeSignature::VariadicAny + | TypeSignature::UserDefined => Arity::Variable, + } + } } /// Represents the class of types that can be used in a function signature. @@ -336,7 +408,7 @@ impl TypeSignatureClass { &self, native_type: &NativeType, origin_type: &DataType, - ) -> datafusion_common::Result { + ) -> Result { match self { TypeSignatureClass::Native(logical_type) => { logical_type.native().default_cast_for(origin_type) @@ -486,6 +558,174 @@ impl TypeSignature { } } + /// Return string representation of the function signature with parameter names. + /// + /// This method is similar to [`Self::to_string_repr`] but uses parameter names + /// instead of types when available. This is useful for generating more helpful + /// error messages. + /// + /// # Arguments + /// * `parameter_names` - Optional slice of parameter names. When provided, these + /// names will be used instead of type names in the output. 
+ /// + /// # Examples + /// ``` + /// # use datafusion_expr_common::signature::TypeSignature; + /// # use arrow::datatypes::DataType; + /// let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + /// + /// // Without names: shows types only + /// assert_eq!(sig.to_string_repr_with_names(None), vec!["Int32, Utf8"]); + /// + /// // With names: shows parameter names with types + /// assert_eq!( + /// sig.to_string_repr_with_names(Some(&["id".to_string(), "name".to_string()])), + /// vec!["id: Int32, name: Utf8"] + /// ); + /// ``` + pub fn to_string_repr_with_names( + &self, + parameter_names: Option<&[String]>, + ) -> Vec { + match self { + TypeSignature::Exact(types) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .zip(types.iter()) + .map(|(name, typ)| format!("{name}: {typ}")) + .collect::>() + .join(", ")] + } else { + vec![Self::join_types(types, ", ")] + } + } + TypeSignature::Any(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: Any")) + .collect::>() + .join(", ")] + } else { + vec![std::iter::repeat_n("Any", *count) + .collect::>() + .join(", ")] + } + } + TypeSignature::Uniform(count, types) => { + if let Some(names) = parameter_names { + let type_str = Self::join_types(types, "/"); + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: {type_str}")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::Coercible(coercions) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .zip(coercions.iter()) + .map(|(name, coercion)| format!("{name}: {coercion}")) + .collect::>() + .join(", ")] + } else { + vec![Self::join_types(coercions, ", ")] + } + } + TypeSignature::Comparable(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: Comparable")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::Numeric(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: Numeric")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::String(count) => { + if let Some(names) = parameter_names { + vec![names + .iter() + .take(*count) + .map(|name| format!("{name}: String")) + .collect::>() + .join(", ")] + } else { + self.to_string_repr() + } + } + TypeSignature::Nullary => self.to_string_repr(), + TypeSignature::ArraySignature(array_sig) => { + if let Some(names) = parameter_names { + match array_sig { + ArrayFunctionSignature::Array { arguments, .. 
} => { + vec![names + .iter() + .zip(arguments.iter()) + .map(|(name, arg_type)| format!("{name}: {arg_type}")) + .collect::>() + .join(", ")] + } + ArrayFunctionSignature::RecursiveArray => { + vec![names + .iter() + .take(1) + .map(|name| format!("{name}: recursive_array")) + .collect::>() + .join(", ")] + } + ArrayFunctionSignature::MapArray => { + vec![names + .iter() + .take(1) + .map(|name| format!("{name}: map_array")) + .collect::>() + .join(", ")] + } + } + } else { + self.to_string_repr() + } + } + TypeSignature::OneOf(sigs) => sigs + .iter() + .flat_map(|s| s.to_string_repr_with_names(parameter_names)) + .collect(), + TypeSignature::UserDefined => { + if let Some(names) = parameter_names { + vec![names.join(", ")] + } else { + self.to_string_repr() + } + } + // Variable arity signatures cannot use parameter names + TypeSignature::Variadic(_) | TypeSignature::VariadicAny => { + self.to_string_repr() + } + } + } + /// Helper function to join types with specified delimiter. pub fn join_types(types: &[T], delimiter: &str) -> String { types @@ -804,6 +1044,13 @@ pub struct Signature { pub type_signature: TypeSignature, /// The volatility of the function. See [Volatility] for more information. pub volatility: Volatility, + /// Optional parameter names for the function arguments. + /// + /// If provided, enables named argument notation for function calls (e.g., `func(a => 1, b => 2)`). + /// The length must match the number of arguments defined by `type_signature`. + /// + /// Defaults to `None`, meaning only positional arguments are supported. + pub parameter_names: Option>, } impl Signature { @@ -812,6 +1059,7 @@ impl Signature { Signature { type_signature, volatility, + parameter_names: None, } } /// An arbitrary number of arguments with the same type, from those listed in `common_types`. @@ -819,6 +1067,7 @@ impl Signature { Self { type_signature: TypeSignature::Variadic(common_types), volatility, + parameter_names: None, } } /// User-defined coercion rules for the function. @@ -826,6 +1075,7 @@ impl Signature { Self { type_signature: TypeSignature::UserDefined, volatility, + parameter_names: None, } } @@ -834,6 +1084,7 @@ impl Signature { Self { type_signature: TypeSignature::Numeric(arg_count), volatility, + parameter_names: None, } } @@ -842,6 +1093,7 @@ impl Signature { Self { type_signature: TypeSignature::String(arg_count), volatility, + parameter_names: None, } } @@ -850,6 +1102,7 @@ impl Signature { Self { type_signature: TypeSignature::VariadicAny, volatility, + parameter_names: None, } } /// A fixed number of arguments of the same type, from those listed in `valid_types`. @@ -861,6 +1114,7 @@ impl Signature { Self { type_signature: TypeSignature::Uniform(arg_count, valid_types), volatility, + parameter_names: None, } } /// Exactly matches the types in `exact_types`, in order. 
@@ -868,6 +1122,7 @@ impl Signature { Signature { type_signature: TypeSignature::Exact(exact_types), volatility, + parameter_names: None, } } @@ -876,6 +1131,7 @@ impl Signature { Self { type_signature: TypeSignature::Coercible(target_types), volatility, + parameter_names: None, } } @@ -884,6 +1140,7 @@ impl Signature { Self { type_signature: TypeSignature::Comparable(arg_count), volatility, + parameter_names: None, } } @@ -891,6 +1148,7 @@ impl Signature { Signature { type_signature: TypeSignature::Nullary, volatility, + parameter_names: None, } } @@ -899,6 +1157,7 @@ impl Signature { Signature { type_signature: TypeSignature::Any(arg_count), volatility, + parameter_names: None, } } @@ -907,6 +1166,7 @@ impl Signature { Signature { type_signature: TypeSignature::OneOf(type_signatures), volatility, + parameter_names: None, } } @@ -923,6 +1183,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -939,6 +1200,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -956,6 +1218,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -980,6 +1243,7 @@ impl Signature { }), ]), volatility, + parameter_names: None, } } @@ -996,6 +1260,7 @@ impl Signature { }, ), volatility, + parameter_names: None, } } @@ -1003,13 +1268,72 @@ impl Signature { pub fn array(volatility: Volatility) -> Self { Signature::arrays(1, Some(ListCoercion::FixedSizedListToList), volatility) } + + /// Add parameter names to this signature, enabling named argument notation. + /// + /// # Example + /// ``` + /// # use datafusion_expr_common::signature::{Signature, Volatility}; + /// # use arrow::datatypes::DataType; + /// let sig = Signature::exact(vec![DataType::Int32, DataType::Utf8], Volatility::Immutable) + /// .with_parameter_names(vec!["count".to_string(), "name".to_string()]); + /// ``` + /// + /// # Errors + /// Returns an error if the number of parameter names doesn't match the signature's arity. + /// For signatures with variable arity (e.g., `Variadic`, `VariadicAny`), parameter names + /// cannot be specified. 
+ pub fn with_parameter_names(mut self, names: Vec>) -> Result { + let names = names.into_iter().map(Into::into).collect::>(); + // Validate that the number of names matches the signature + self.validate_parameter_names(&names)?; + self.parameter_names = Some(names); + Ok(self) + } + + /// Validate that parameter names are compatible with this signature + fn validate_parameter_names(&self, names: &[String]) -> Result<()> { + match self.type_signature.arity() { + Arity::Fixed(expected) => { + if names.len() != expected { + return plan_err!( + "Parameter names count ({}) does not match signature arity ({})", + names.len(), + expected + ); + } + } + Arity::Variable => { + // For UserDefined signatures, allow parameter names + // The function implementer is responsible for validating the names match the actual arguments + if !matches!(self.type_signature, TypeSignature::UserDefined) { + return plan_err!( + "Cannot specify parameter names for variable arity signature: {:?}", + self.type_signature + ); + } + } + } + + let mut seen = std::collections::HashSet::new(); + for name in names { + if !seen.insert(name) { + return plan_err!("Duplicate parameter name: '{}'", name); + } + } + + Ok(()) + } } #[cfg(test)] mod tests { - use datafusion_common::types::{logical_int64, logical_string}; + use datafusion_common::types::{logical_int32, logical_int64, logical_string}; use super::*; + use crate::signature::{ + ArrayFunctionArgument, ArrayFunctionSignature, Coercion, TypeSignatureClass, + }; #[test] fn supports_zero_argument_tests() { @@ -1167,4 +1491,430 @@ mod tests { ] ); } + + #[test] + fn test_signature_with_parameter_names() { + let sig = Signature::exact( + vec![DataType::Int32, DataType::Utf8], + Volatility::Immutable, + ) + .with_parameter_names(vec!["count".to_string(), "name".to_string()]) + .unwrap(); + + assert_eq!( + sig.parameter_names, + Some(vec!["count".to_string(), "name".to_string()]) + ); + assert_eq!( + sig.type_signature, + TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]) + ); + } + + #[test] + fn test_signature_parameter_names_wrong_count() { + let result = Signature::exact( + vec![DataType::Int32, DataType::Utf8], + Volatility::Immutable, + ) + .with_parameter_names(vec!["count".to_string()]); // Only 1 name for 2 args + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("does not match signature arity")); + } + + #[test] + fn test_signature_parameter_names_duplicate() { + let result = Signature::exact( + vec![DataType::Int32, DataType::Int32], + Volatility::Immutable, + ) + .with_parameter_names(vec!["count".to_string(), "count".to_string()]); + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Duplicate parameter name")); + } + + #[test] + fn test_signature_parameter_names_variadic() { + let result = Signature::variadic(vec![DataType::Int32], Volatility::Immutable) + .with_parameter_names(vec!["arg".to_string()]); + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("variable arity signature")); + } + + #[test] + fn test_signature_without_parameter_names() { + let sig = Signature::exact( + vec![DataType::Int32, DataType::Utf8], + Volatility::Immutable, + ); + + assert_eq!(sig.parameter_names, None); + } + + #[test] + fn test_signature_uniform_with_parameter_names() { + let sig = Signature::uniform(3, vec![DataType::Float64], Volatility::Immutable) + .with_parameter_names(vec!["x".to_string(), "y".to_string(), "z".to_string()]) + .unwrap(); + + 
assert_eq!( + sig.parameter_names, + Some(vec!["x".to_string(), "y".to_string(), "z".to_string()]) + ); + } + + #[test] + fn test_signature_numeric_with_parameter_names() { + let sig = Signature::numeric(2, Volatility::Immutable) + .with_parameter_names(vec!["a".to_string(), "b".to_string()]) + .unwrap(); + + assert_eq!( + sig.parameter_names, + Some(vec!["a".to_string(), "b".to_string()]) + ); + } + + #[test] + fn test_signature_nullary_with_empty_names() { + let sig = Signature::nullary(Volatility::Immutable) + .with_parameter_names(Vec::::new()) + .unwrap(); + + assert_eq!(sig.parameter_names, Some(vec![])); + } + + #[test] + fn test_to_string_repr_with_names_exact() { + let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + + assert_eq!(sig.to_string_repr_with_names(None), vec!["Int32, Utf8"]); + + let names = vec!["id".to_string(), "name".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["id: Int32, name: Utf8"] + ); + } + + #[test] + fn test_to_string_repr_with_names_any() { + let sig = TypeSignature::Any(3); + + assert_eq!(sig.to_string_repr_with_names(None), vec!["Any, Any, Any"]); + + let names = vec!["x".to_string(), "y".to_string(), "z".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["x: Any, y: Any, z: Any"] + ); + } + + #[test] + fn test_to_string_repr_with_names_one_of() { + let sig = + TypeSignature::OneOf(vec![TypeSignature::Any(2), TypeSignature::Any(3)]); + + assert_eq!( + sig.to_string_repr_with_names(None), + vec!["Any, Any", "Any, Any, Any"] + ); + + let names = vec![ + "str".to_string(), + "start_pos".to_string(), + "length".to_string(), + ]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec![ + "str: Any, start_pos: Any", + "str: Any, start_pos: Any, length: Any" + ] + ); + } + + #[test] + fn test_to_string_repr_with_names_partial() { + // This simulates providing max arity names for a OneOf signature + let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + + // Provide 3 names for 2-parameter signature (extra name is ignored via zip) + let names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["a: Int32, b: Utf8"] + ); + } + + #[test] + fn test_to_string_repr_with_names_uniform() { + let sig = TypeSignature::Uniform(2, vec![DataType::Float64]); + + assert_eq!( + sig.to_string_repr_with_names(None), + vec!["Float64, Float64"] + ); + + let names = vec!["x".to_string(), "y".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["x: Float64, y: Float64"] + ); + } + + #[test] + fn test_to_string_repr_with_names_coercible() { + let sig = TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_int32())), + Coercion::new_exact(TypeSignatureClass::Native(logical_int32())), + ]); + + let names = vec!["a".to_string(), "b".to_string()]; + let result = sig.to_string_repr_with_names(Some(&names)); + // Check that it contains the parameter names with type annotations + assert_eq!(result.len(), 1); + assert!(result[0].starts_with("a: ")); + assert!(result[0].contains(", b: ")); + } + + #[test] + fn test_to_string_repr_with_names_comparable_numeric_string() { + let comparable = TypeSignature::Comparable(3); + let numeric = TypeSignature::Numeric(2); + let string_sig = TypeSignature::String(2); + + let names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // All should show parameter names with type annotations + 
assert_eq!( + comparable.to_string_repr_with_names(Some(&names)), + vec!["a: Comparable, b: Comparable, c: Comparable"] + ); + assert_eq!( + numeric.to_string_repr_with_names(Some(&names)), + vec!["a: Numeric, b: Numeric"] + ); + assert_eq!( + string_sig.to_string_repr_with_names(Some(&names)), + vec!["a: String, b: String"] + ); + } + + #[test] + fn test_to_string_repr_with_names_variadic_fallback() { + let variadic = TypeSignature::Variadic(vec![DataType::Utf8, DataType::LargeUtf8]); + let names = vec!["x".to_string()]; + assert_eq!( + variadic.to_string_repr_with_names(Some(&names)), + variadic.to_string_repr() + ); + + let variadic_any = TypeSignature::VariadicAny; + assert_eq!( + variadic_any.to_string_repr_with_names(Some(&names)), + variadic_any.to_string_repr() + ); + + // UserDefined now shows parameter names when available + let user_defined = TypeSignature::UserDefined; + assert_eq!( + user_defined.to_string_repr_with_names(Some(&names)), + vec!["x"] + ); + assert_eq!( + user_defined.to_string_repr_with_names(None), + user_defined.to_string_repr() + ); + } + + #[test] + fn test_to_string_repr_with_names_nullary() { + let sig = TypeSignature::Nullary; + let names = vec!["x".to_string()]; + + // Should return empty representation, names don't apply + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["NullAry()"] + ); + assert_eq!(sig.to_string_repr_with_names(None), vec!["NullAry()"]); + } + + #[test] + fn test_to_string_repr_with_names_array_signature() { + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Index, + ArrayFunctionArgument::Element, + ], + array_coercion: None, + }); + + assert_eq!( + sig.to_string_repr_with_names(None), + vec!["array, index, element"] + ); + + let names = vec!["arr".to_string(), "idx".to_string(), "val".to_string()]; + assert_eq!( + sig.to_string_repr_with_names(Some(&names)), + vec!["arr: array, idx: index, val: element"] + ); + + let recursive = + TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray); + let names = vec!["array".to_string()]; + assert_eq!( + recursive.to_string_repr_with_names(Some(&names)), + vec!["array: recursive_array"] + ); + + // Test MapArray (1 argument) + let map_array = TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray); + let names = vec!["map".to_string()]; + assert_eq!( + map_array.to_string_repr_with_names(Some(&names)), + vec!["map: map_array"] + ); + } + + #[test] + fn test_type_signature_arity_exact() { + let sig = TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]); + assert_eq!(sig.arity(), Arity::Fixed(2)); + + let sig = TypeSignature::Exact(vec![]); + assert_eq!(sig.arity(), Arity::Fixed(0)); + } + + #[test] + fn test_type_signature_arity_uniform() { + let sig = TypeSignature::Uniform(3, vec![DataType::Float64]); + assert_eq!(sig.arity(), Arity::Fixed(3)); + + let sig = TypeSignature::Uniform(1, vec![DataType::Int32]); + assert_eq!(sig.arity(), Arity::Fixed(1)); + } + + #[test] + fn test_type_signature_arity_numeric() { + let sig = TypeSignature::Numeric(2); + assert_eq!(sig.arity(), Arity::Fixed(2)); + } + + #[test] + fn test_type_signature_arity_string() { + let sig = TypeSignature::String(3); + assert_eq!(sig.arity(), Arity::Fixed(3)); + } + + #[test] + fn test_type_signature_arity_comparable() { + let sig = TypeSignature::Comparable(2); + assert_eq!(sig.arity(), Arity::Fixed(2)); + } + + #[test] + fn test_type_signature_arity_any() { + let sig = 
TypeSignature::Any(4); + assert_eq!(sig.arity(), Arity::Fixed(4)); + } + + #[test] + fn test_type_signature_arity_coercible() { + let sig = TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_int32())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]); + assert_eq!(sig.arity(), Arity::Fixed(2)); + } + + #[test] + fn test_type_signature_arity_nullary() { + let sig = TypeSignature::Nullary; + assert_eq!(sig.arity(), Arity::Fixed(0)); + } + + #[test] + fn test_type_signature_arity_array_signature() { + // Test Array variant with 2 arguments + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ArrayFunctionArgument::Array, ArrayFunctionArgument::Index], + array_coercion: None, + }); + assert_eq!(sig.arity(), Arity::Fixed(2)); + + // Test Array variant with 3 arguments + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::Array { + arguments: vec![ + ArrayFunctionArgument::Array, + ArrayFunctionArgument::Element, + ArrayFunctionArgument::Index, + ], + array_coercion: None, + }); + assert_eq!(sig.arity(), Arity::Fixed(3)); + + // Test RecursiveArray variant + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::RecursiveArray); + assert_eq!(sig.arity(), Arity::Fixed(1)); + + // Test MapArray variant + let sig = TypeSignature::ArraySignature(ArrayFunctionSignature::MapArray); + assert_eq!(sig.arity(), Arity::Fixed(1)); + } + + #[test] + fn test_type_signature_arity_one_of_fixed() { + // OneOf with all fixed arity variants should return max arity + let sig = TypeSignature::OneOf(vec![ + TypeSignature::Exact(vec![DataType::Int32]), + TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]), + TypeSignature::Exact(vec![ + DataType::Int32, + DataType::Utf8, + DataType::Float64, + ]), + ]); + assert_eq!(sig.arity(), Arity::Fixed(3)); + } + + #[test] + fn test_type_signature_arity_one_of_variable() { + // OneOf with variable arity variant should return Variable + let sig = TypeSignature::OneOf(vec![ + TypeSignature::Exact(vec![DataType::Int32]), + TypeSignature::VariadicAny, + ]); + assert_eq!(sig.arity(), Arity::Variable); + } + + #[test] + fn test_type_signature_arity_variadic() { + let sig = TypeSignature::Variadic(vec![DataType::Int32]); + assert_eq!(sig.arity(), Arity::Variable); + + let sig = TypeSignature::VariadicAny; + assert_eq!(sig.arity(), Arity::Variable); + } + + #[test] + fn test_type_signature_arity_user_defined() { + let sig = TypeSignature::UserDefined; + assert_eq!(sig.arity(), Arity::Variable); + } } diff --git a/datafusion/expr/src/arguments.rs b/datafusion/expr/src/arguments.rs new file mode 100644 index 000000000000..5653993db98f --- /dev/null +++ b/datafusion/expr/src/arguments.rs @@ -0,0 +1,285 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Argument resolution logic for named function parameters + +use crate::Expr; +use datafusion_common::{plan_err, Result}; +use std::collections::HashMap; + +/// Resolves function arguments, handling named and positional notation. +/// +/// This function validates and reorders arguments to match the function's parameter names +/// when named arguments are used. +/// +/// # Rules +/// - All positional arguments must come before named arguments +/// - Named arguments can be in any order after positional arguments +/// - Parameter names follow SQL identifier rules: unquoted names are case-insensitive +/// (normalized to lowercase), quoted names are case-sensitive +/// - No duplicate parameter names allowed +/// +/// # Arguments +/// * `param_names` - The function's parameter names in order +/// * `args` - The argument expressions +/// * `arg_names` - Optional parameter name for each argument +/// +/// # Returns +/// A vector of expressions in the correct order matching the parameter names +/// +/// # Examples +/// ```text +/// Given parameters ["a", "b", "c"] +/// And call: func(10, c => 30, b => 20) +/// Returns: [Expr(10), Expr(20), Expr(30)] +/// ``` +pub fn resolve_function_arguments( + param_names: &[String], + args: Vec, + arg_names: Vec>, +) -> Result> { + if args.len() != arg_names.len() { + return plan_err!( + "Internal error: args length ({}) != arg_names length ({})", + args.len(), + arg_names.len() + ); + } + + // Check if all arguments are positional (fast path) + if arg_names.iter().all(|name| name.is_none()) { + return Ok(args); + } + + validate_argument_order(&arg_names)?; + + reorder_named_arguments(param_names, args, arg_names) +} + +/// Validates that positional arguments come before named arguments +fn validate_argument_order(arg_names: &[Option]) -> Result<()> { + let mut seen_named = false; + for (i, arg_name) in arg_names.iter().enumerate() { + match arg_name { + Some(_) => seen_named = true, + None if seen_named => { + return plan_err!( + "Positional argument at position {} follows named argument. \ + All positional arguments must come before named arguments.", + i + ); + } + None => {} + } + } + Ok(()) +} + +/// Reorders arguments based on named parameters to match signature order +fn reorder_named_arguments( + param_names: &[String], + args: Vec, + arg_names: Vec>, +) -> Result> { + // Build HashMap for O(1) parameter name lookups + let param_index_map: HashMap<&str, usize> = param_names + .iter() + .enumerate() + .map(|(idx, name)| (name.as_str(), idx)) + .collect(); + + let positional_count = arg_names.iter().filter(|n| n.is_none()).count(); + + // Capture args length before consuming the vector + let args_len = args.len(); + + let expected_arg_count = param_names.len(); + + if positional_count > expected_arg_count { + return plan_err!( + "Too many positional arguments: expected at most {}, got {}", + expected_arg_count, + positional_count + ); + } + + let mut result: Vec> = vec![None; expected_arg_count]; + + for (i, (arg, arg_name)) in args.into_iter().zip(arg_names).enumerate() { + if let Some(name) = arg_name { + // Named argument - O(1) lookup in HashMap + let param_index = + param_index_map.get(name.as_str()).copied().ok_or_else(|| { + datafusion_common::plan_datafusion_err!( + "Unknown parameter name '{}'. 
Valid parameters are: [{}]", + name, + param_names.join(", ") + ) + })?; + + if result[param_index].is_some() { + return plan_err!("Parameter '{}' specified multiple times", name); + } + + result[param_index] = Some(arg); + } else { + result[i] = Some(arg); + } + } + + // Only require parameters up to the number of arguments provided (supports optional parameters) + let required_count = args_len; + for i in 0..required_count { + if result[i].is_none() { + return plan_err!("Missing required parameter '{}'", param_names[i]); + } + } + + // Return only the assigned parameters (handles optional trailing parameters) + Ok(result.into_iter().take(required_count).flatten().collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::lit; + + #[test] + fn test_all_positional() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![None, None]; + + let result = + resolve_function_arguments(¶m_names, args.clone(), arg_names).unwrap(); + assert_eq!(result.len(), 2); + } + + #[test] + fn test_all_named() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![Some("a".to_string()), Some("b".to_string())]; + + let result = resolve_function_arguments(¶m_names, args, arg_names).unwrap(); + assert_eq!(result.len(), 2); + } + + #[test] + fn test_named_reordering() { + let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // Call with: func(c => 3.0, a => 1, b => "hello") + let args = vec![lit(3.0), lit(1), lit("hello")]; + let arg_names = vec![ + Some("c".to_string()), + Some("a".to_string()), + Some("b".to_string()), + ]; + + let result = resolve_function_arguments(¶m_names, args, arg_names).unwrap(); + + // Should be reordered to [a, b, c] = [1, "hello", 3.0] + assert_eq!(result.len(), 3); + assert_eq!(result[0], lit(1)); + assert_eq!(result[1], lit("hello")); + assert_eq!(result[2], lit(3.0)); + } + + #[test] + fn test_mixed_positional_and_named() { + let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // Call with: func(1, c => 3.0, b => "hello") + let args = vec![lit(1), lit(3.0), lit("hello")]; + let arg_names = vec![None, Some("c".to_string()), Some("b".to_string())]; + + let result = resolve_function_arguments(¶m_names, args, arg_names).unwrap(); + + // Should be reordered to [a, b, c] = [1, "hello", 3.0] + assert_eq!(result.len(), 3); + assert_eq!(result[0], lit(1)); + assert_eq!(result[1], lit("hello")); + assert_eq!(result[2], lit(3.0)); + } + + #[test] + fn test_positional_after_named_error() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + // Call with: func(a => 1, "hello") - ERROR + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![Some("a".to_string()), None]; + + let result = resolve_function_arguments(¶m_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Positional argument")); + } + + #[test] + fn test_unknown_parameter_name() { + let param_names = vec!["a".to_string(), "b".to_string()]; + + // Call with: func(x => 1, b => "hello") - ERROR + let args = vec![lit(1), lit("hello")]; + let arg_names = vec![Some("x".to_string()), Some("b".to_string())]; + + let result = resolve_function_arguments(¶m_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Unknown parameter")); + } + + #[test] + fn test_duplicate_parameter_name() { + let 
param_names = vec!["a".to_string(), "b".to_string()]; + + // Call with: func(a => 1, a => 2) - ERROR + let args = vec![lit(1), lit(2)]; + let arg_names = vec![Some("a".to_string()), Some("a".to_string())]; + + let result = resolve_function_arguments(¶m_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("specified multiple times")); + } + + #[test] + fn test_missing_required_parameter() { + let param_names = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + + // Call with: func(a => 1, c => 3.0) - missing 'b' + let args = vec![lit(1), lit(3.0)]; + let arg_names = vec![Some("a".to_string()), Some("c".to_string())]; + + let result = resolve_function_arguments(¶m_names, args, arg_names); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Missing required parameter")); + } +} diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 346d373ff5b4..2b7cc9d46ad3 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -44,6 +44,7 @@ mod udaf; mod udf; mod udwf; +pub mod arguments; pub mod conditional_expressions; pub mod execution_props; pub mod expr; diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index b91db4527b3a..74ba99847f70 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -936,7 +936,7 @@ pub fn generate_signature_error_msg( ) -> String { let candidate_signatures = func_signature .type_signature - .to_string_repr() + .to_string_repr_with_names(func_signature.parameter_names.as_deref()) .iter() .map(|args_str| format!("\t{func_name}({args_str})")) .collect::>() @@ -1295,6 +1295,7 @@ mod tests { Cast, ExprFunctionExt, WindowFunctionDefinition, }; use arrow::datatypes::{UnionFields, UnionMode}; + use datafusion_expr_common::signature::{TypeSignature, Volatility}; #[test] fn test_group_window_expr_by_sort_keys_empty_case() -> Result<()> { @@ -1714,4 +1715,52 @@ mod tests { DataType::List(Arc::new(Field::new("my_union", union_type, true))); assert!(!can_hash(&list_union_type)); } + + #[test] + fn test_generate_signature_error_msg_with_parameter_names() { + let sig = Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Utf8, DataType::Int64]), + TypeSignature::Exact(vec![ + DataType::Utf8, + DataType::Int64, + DataType::Int64, + ]), + ], + Volatility::Immutable, + ) + .with_parameter_names(vec![ + "str".to_string(), + "start_pos".to_string(), + "length".to_string(), + ]) + .expect("valid parameter names"); + + // Generate error message with only 1 argument provided + let error_msg = generate_signature_error_msg("substr", sig, &[DataType::Utf8]); + + assert!( + error_msg.contains("str: Utf8, start_pos: Int64"), + "Expected 'str: Utf8, start_pos: Int64' in error message, got: {error_msg}" + ); + assert!( + error_msg.contains("str: Utf8, start_pos: Int64, length: Int64"), + "Expected 'str: Utf8, start_pos: Int64, length: Int64' in error message, got: {error_msg}" + ); + } + + #[test] + fn test_generate_signature_error_msg_without_parameter_names() { + let sig = Signature::one_of( + vec![TypeSignature::Any(2), TypeSignature::Any(3)], + Volatility::Immutable, + ); + + let error_msg = generate_signature_error_msg("my_func", sig, &[DataType::Int32]); + + assert!( + error_msg.contains("Any, Any"), + "Expected 'Any, Any' without parameter names, got: {error_msg}" + ); + } } diff --git a/datafusion/functions-nested/src/replace.rs b/datafusion/functions-nested/src/replace.rs index 
59f851a776a1..4314d41419bc 100644 --- a/datafusion/functions-nested/src/replace.rs +++ b/datafusion/functions-nested/src/replace.rs @@ -105,6 +105,7 @@ impl ArrayReplace { }, ), volatility: Volatility::Immutable, + parameter_names: None, }, aliases: vec![String::from("list_replace")], } @@ -186,6 +187,7 @@ impl ArrayReplaceN { }, ), volatility: Volatility::Immutable, + parameter_names: None, }, aliases: vec![String::from("list_replace_n")], } @@ -265,6 +267,7 @@ impl ArrayReplaceAll { }, ), volatility: Volatility::Immutable, + parameter_names: None, }, aliases: vec![String::from("list_replace_all")], } diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 0b35f664532d..46b3cc63d0b6 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -71,7 +71,13 @@ impl Default for SubstrFunc { impl SubstrFunc { pub fn new() -> Self { Self { - signature: Signature::user_defined(Volatility::Immutable), + signature: Signature::user_defined(Volatility::Immutable) + .with_parameter_names(vec![ + "str".to_string(), + "start_pos".to_string(), + "length".to_string(), + ]) + .expect("valid parameter names"), aliases: vec![String::from("substring")], } } diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index eabf645a5eaf..cb34bb0f7eb7 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -274,8 +274,28 @@ impl SqlToRel<'_, S> { } // User-defined function (UDF) should have precedence if let Some(fm) = self.context_provider.get_function_meta(&name) { - let args = self.function_args_to_expr(args, schema, planner_context)?; - let inner = ScalarFunction::new_udf(fm, args); + let (args, arg_names) = + self.function_args_to_expr_with_names(args, schema, planner_context)?; + + let resolved_args = if arg_names.iter().any(|name| name.is_some()) { + if let Some(param_names) = &fm.signature().parameter_names { + datafusion_expr::arguments::resolve_function_arguments( + param_names, + args, + arg_names, + )? 
+ } else { + return plan_err!( + "Function '{}' does not support named arguments", + fm.name() + ); + } + } else { + args + }; + + // After resolution, all arguments are positional + let inner = ScalarFunction::new_udf(fm, resolved_args); if name.eq_ignore_ascii_case(inner.name()) { return Ok(Expr::ScalarFunction(inner)); @@ -624,14 +644,29 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { + let (expr, _) = + self.sql_fn_arg_to_logical_expr_with_name(sql, schema, planner_context)?; + Ok(expr) + } + + fn sql_fn_arg_to_logical_expr_with_name( + &self, + sql: FunctionArg, + schema: &DFSchema, + planner_context: &mut PlannerContext, + ) -> Result<(Expr, Option)> { match sql { FunctionArg::Named { - name: _, + name, arg: FunctionArgExpr::Expr(arg), operator: _, - } => self.sql_expr_to_logical_expr(arg, schema, planner_context), + } => { + let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?; + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) + } FunctionArg::Named { - name: _, + name, arg: FunctionArgExpr::Wildcard, operator: _, } => { @@ -640,11 +675,12 @@ impl SqlToRel<'_, S> { qualifier: None, options: Box::new(WildcardOptions::default()), }; - - Ok(expr) + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) } FunctionArg::Unnamed(FunctionArgExpr::Expr(arg)) => { - self.sql_expr_to_logical_expr(arg, schema, planner_context) + let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?; + Ok((expr, None)) } FunctionArg::Unnamed(FunctionArgExpr::Wildcard) => { #[expect(deprecated)] @@ -652,8 +688,7 @@ impl SqlToRel<'_, S> { qualifier: None, options: Box::new(WildcardOptions::default()), }; - - Ok(expr) + Ok((expr, None)) } FunctionArg::Unnamed(FunctionArgExpr::QualifiedWildcard(object_name)) => { let qualifier = self.object_name_to_table_reference(object_name)?; @@ -668,8 +703,30 @@ impl SqlToRel<'_, S> { qualifier: qualifier.into(), options: Box::new(WildcardOptions::default()), }; - - Ok(expr) + Ok((expr, None)) + } + // PostgreSQL dialect uses ExprNamed variant with expression for name + FunctionArg::ExprNamed { + name: SQLExpr::Identifier(name), + arg: FunctionArgExpr::Expr(arg), + operator: _, + } => { + let expr = self.sql_expr_to_logical_expr(arg, schema, planner_context)?; + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) + } + FunctionArg::ExprNamed { + name: SQLExpr::Identifier(name), + arg: FunctionArgExpr::Wildcard, + operator: _, + } => { + #[expect(deprecated)] + let expr = Expr::Wildcard { + qualifier: None, + options: Box::new(WildcardOptions::default()), + }; + let arg_name = crate::utils::normalize_ident(name); + Ok((expr, Some(arg_name))) } _ => not_impl_err!("Unsupported qualified wildcard argument: {sql:?}"), } @@ -686,6 +743,24 @@ impl SqlToRel<'_, S> { .collect::>>() } + pub(super) fn function_args_to_expr_with_names( + &self, + args: Vec, + schema: &DFSchema, + planner_context: &mut PlannerContext, + ) -> Result<(Vec, Vec>)> { + let results: Result)>> = args + .into_iter() + .map(|a| { + self.sql_fn_arg_to_logical_expr_with_name(a, schema, planner_context) + }) + .collect(); + + let pairs = results?; + let (exprs, names): (Vec, Vec>) = pairs.into_iter().unzip(); + Ok((exprs, names)) + } + pub(crate) fn check_unnest_arg(arg: &Expr, schema: &DFSchema) -> Result<()> { // Check argument type, array types are supported match arg.get_type(schema)? 
{ diff --git a/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs b/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs index 375f06d34b44..4d310711687f 100644 --- a/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs +++ b/datafusion/sqllogictest/src/engines/postgres_engine/mod.rs @@ -76,8 +76,8 @@ impl Postgres { /// /// See https://docs.rs/tokio-postgres/latest/tokio_postgres/config/struct.Config.html#url for format pub async fn connect(relative_path: PathBuf, pb: ProgressBar) -> Result { - let uri = - std::env::var("PG_URI").map_or(PG_URI.to_string(), std::convert::identity); + let uri = std::env::var("PG_URI") + .map_or_else(|_| PG_URI.to_string(), std::convert::identity); info!("Using postgres connection string: {uri}"); diff --git a/datafusion/sqllogictest/test_files/named_arguments.slt b/datafusion/sqllogictest/test_files/named_arguments.slt new file mode 100644 index 000000000000..c93da7e7a8f9 --- /dev/null +++ b/datafusion/sqllogictest/test_files/named_arguments.slt @@ -0,0 +1,139 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +############# +## Tests for Named Arguments (PostgreSQL-style param => value syntax) +############# + +# Test positional arguments still work (baseline) +query T +SELECT substr('hello world', 7, 5); +---- +world + +# Test named arguments in order +query T +SELECT substr(str => 'hello world', start_pos => 7, length => 5); +---- +world + +# Test named arguments out of order +query T +SELECT substr(length => 5, str => 'hello world', start_pos => 7); +---- +world + +# Test mixed positional and named arguments +query T +SELECT substr('hello world', start_pos => 7, length => 5); +---- +world + +# Test with only 2 parameters (length optional) +query T +SELECT substr(str => 'hello world', start_pos => 7); +---- +world + +# Test all parameters named with substring alias +query T +SELECT substring(str => 'hello', start_pos => 1, length => 3); +---- +hel + +# Error: positional argument after named argument +query error DataFusion error: Error during planning: Positional argument.*follows named argument +SELECT substr(str => 'hello', 1, 3); + +# Error: unknown parameter name +query error DataFusion error: Error during planning: Unknown parameter name 'invalid' +SELECT substr(invalid => 'hello', start_pos => 1, length => 3); + +# Error: duplicate parameter name +query error DataFusion error: Error during planning: Parameter 'str' specified multiple times +SELECT substr(str => 'hello', str => 'world', start_pos => 1); + +# Test case-insensitive parameter names (unquoted identifiers) +query T +SELECT substr(STR => 'hello world', START_POS => 7, LENGTH => 5); +---- +world + +# Test case-insensitive with mixed case +query T +SELECT substr(Str => 'hello world', Start_Pos => 7); +---- +world + +# Error: case-sensitive quoted parameter names don't match +query error DataFusion error: Error during planning: Unknown parameter name 'STR' +SELECT substr("STR" => 'hello world', "start_pos" => 7); + +# Error: wrong number of arguments +# This query provides only 1 argument but substr requires 2 or 3 +query error DataFusion error: Error during planning: Execution error: Function 'substr' user-defined coercion failed with "Error during planning: The substr function requires 2 or 3 arguments, but got 1." 
+SELECT substr(str => 'hello world'); + +############# +## PostgreSQL Dialect Tests (uses ExprNamed variant) +############# + +statement ok +set datafusion.sql_parser.dialect = 'PostgreSQL'; + +# Test named arguments in order +query T +SELECT substr(str => 'hello world', start_pos => 7, length => 5); +---- +world + +# Test named arguments out of order +query T +SELECT substr(length => 5, str => 'hello world', start_pos => 7); +---- +world + +# Test mixed positional and named arguments +query T +SELECT substr('hello world', start_pos => 7, length => 5); +---- +world + +# Test with only 2 parameters (length optional) +query T +SELECT substr(str => 'hello world', start_pos => 7); +---- +world + +# Reset to default dialect +statement ok +set datafusion.sql_parser.dialect = 'Generic'; + +############# +## MsSQL Dialect Tests (does NOT support => operator) +############# + +statement ok +set datafusion.sql_parser.dialect = 'MsSQL'; + +# Error: MsSQL dialect does not support => operator +query error DataFusion error: SQL error: ParserError\("Expected: \), found: => at Line: 1, Column: 19"\) +SELECT substr(str => 'hello world', start_pos => 7, length => 5); + +# Reset to default dialect +statement ok +set datafusion.sql_parser.dialect = 'Generic'; diff --git a/docs/source/library-user-guide/functions/adding-udfs.md b/docs/source/library-user-guide/functions/adding-udfs.md index ecb618179ea1..7581d8b6505e 100644 --- a/docs/source/library-user-guide/functions/adding-udfs.md +++ b/docs/source/library-user-guide/functions/adding-udfs.md @@ -586,6 +586,119 @@ For async UDF implementation details, see [`async_udf.rs`](https://github.com/ap [`process_scalar_func_inputs`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/functions/fn.process_scalar_func_inputs.html [`advanced_udf.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_udf.rs +## Named Arguments + +DataFusion supports PostgreSQL-style named arguments for scalar functions, allowing you to pass arguments by parameter name: + +```sql +SELECT substr(str => 'hello', start_pos => 2, length => 3); +``` + +Named arguments can be mixed with positional arguments, but positional arguments must come first: + +```sql +SELECT substr('hello', start_pos => 2, length => 3); -- Valid +``` + +### Implementing Functions with Named Arguments + +To support named arguments in your UDF, add parameter names to your function's signature using `.with_parameter_names()`: + +```rust +# use arrow::datatypes::DataType; +# use datafusion_expr::{Signature, Volatility}; +# +# #[derive(Debug)] +# struct MyFunction { +# signature: Signature, +# } +# +impl MyFunction { + fn new() -> Self { + Self { + signature: Signature::uniform( + 2, + vec![DataType::Float64], + Volatility::Immutable + ) + .with_parameter_names(vec![ + "base".to_string(), + "exponent".to_string() + ]) + .expect("valid parameter names"), + } + } +} +``` + +The parameter names should match the order of arguments in your function's signature. DataFusion automatically resolves named arguments to the correct positional order before invoking your function. 
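Internally, the resolution described above is performed by `datafusion_expr::arguments::resolve_function_arguments`. The following is a minimal standalone sketch of that step (illustrative only; it reuses the `power(base, exponent)` parameter names from the example below, and `lit` comes from `datafusion_expr`):

```rust
use datafusion_expr::arguments::resolve_function_arguments;
use datafusion_expr::lit;

// Parameter names declared via `.with_parameter_names(...)`
let param_names = vec!["base".to_string(), "exponent".to_string()];

// SQL call: power(exponent => 3.0, base => 2.0)
let args = vec![lit(3.0), lit(2.0)];
let arg_names = vec![Some("exponent".to_string()), Some("base".to_string())];

// Named arguments are reordered to the declared order: [base, exponent]
let resolved = resolve_function_arguments(&param_names, args, arg_names).unwrap();
assert_eq!(resolved, vec![lit(2.0), lit(3.0)]);
```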
+ +### Example + +```rust +# use std::sync::Arc; +# use std::any::Any; +# use arrow::datatypes::DataType; +# use datafusion_common::Result; +# use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +# use datafusion_expr::ScalarUDFImpl; + +#[derive(Debug, PartialEq, Eq, Hash)] +struct PowerFunction { + signature: Signature, +} + +impl PowerFunction { + fn new() -> Self { + Self { + signature: Signature::uniform( + 2, + vec![DataType::Float64], + Volatility::Immutable + ) + .with_parameter_names(vec![ + "base".to_string(), + "exponent".to_string() + ]) + .expect("valid parameter names"), + } + } +} + +impl ScalarUDFImpl for PowerFunction { + fn as_any(&self) -> &dyn Any { self } + fn name(&self) -> &str { "power" } + fn signature(&self) -> &Signature { &self.signature } + + fn return_type(&self, _args: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + // Your implementation - arguments are in correct positional order + unimplemented!() + } +} +``` + +Once registered, users can call your function with named arguments: + +```sql +SELECT power(base => 2.0, exponent => 3.0); +SELECT power(2.0, exponent => 3.0); +``` + +### Error Messages + +When a function call fails due to incorrect arguments, DataFusion will show the parameter names in error messages to help users: + +```text +No function matches the given name and argument types substr(Utf8). + Candidate functions: + substr(str: Any, start_pos: Any) + substr(str: Any, start_pos: Any, length: Any) +``` + ## Adding a Window UDF Scalar UDFs are functions that take a row of data and return a single value. Window UDFs are similar, but they also have From 5dc42f43a8c26e24175d14d4c4aeebc2da0b3e80 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Tue, 28 Oct 2025 17:41:21 +0800 Subject: [PATCH 033/157] Change default prefetch_hint to 512Kb to reduce number of object store requests when reading parquet files (#18160) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …default (set metadata_size_hint) ## Which issue does this PR close? - Closes [#18118](https://github.com/apache/datafusion/issues/18118) ## Rationale for this change Reduce number of object store requests when reading parquet files by default (set metadata_size_hint) ## What changes are included in this PR? ```rust /// Default setting to 512 KB, which should be sufficient for most parquet files, /// it can reduce one I/O operation per parquet file. If the metadata is larger than /// the hint, two reads will still be performed. pub metadata_size_hint: Option, default = Some(512 * 1024) ``` ## Are these changes tested? Yes ## Are there any user-facing changes? No --------- Co-authored-by: Daniël Heres Co-authored-by: Andrew Lamb --- datafusion/common/src/config.rs | 5 +- .../src/datasource/file_format/options.rs | 14 + .../src/datasource/file_format/parquet.rs | 7 +- .../tests/datasource/object_store_access.rs | 291 +++++++++++++----- .../test_files/information_schema.slt | 4 +- docs/source/user-guide/configs.md | 2 +- 6 files changed, 239 insertions(+), 84 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 1713377f8d4d..10199db1a1de 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -621,7 +621,10 @@ config_namespace! { /// bytes of the parquet file optimistically. 
If not specified, two reads are required: /// One read to fetch the 8-byte parquet footer and /// another to fetch the metadata length encoded in the footer - pub metadata_size_hint: Option, default = None + /// Default setting to 512 KiB, which should be sufficient for most parquet files, + /// it can reduce one I/O operation per parquet file. If the metadata is larger than + /// the hint, two reads will still be performed. + pub metadata_size_hint: Option, default = Some(512 * 1024) /// (reading) If true, filter expressions are be applied during the parquet decoding operation to /// reduce the number of rows decoded. This optimization is sometimes called "late materialization". diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index 8c1bb02ef073..e78c5f09553c 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -269,6 +269,8 @@ pub struct ParquetReadOptions<'a> { pub file_sort_order: Vec>, /// Properties for decryption of Parquet files that use modular encryption pub file_decryption_properties: Option, + /// Metadata size hint for Parquet files reading (in bytes) + pub metadata_size_hint: Option, } impl Default for ParquetReadOptions<'_> { @@ -281,6 +283,7 @@ impl Default for ParquetReadOptions<'_> { schema: None, file_sort_order: vec![], file_decryption_properties: None, + metadata_size_hint: None, } } } @@ -340,6 +343,12 @@ impl<'a> ParquetReadOptions<'a> { self.file_decryption_properties = Some(file_decryption_properties); self } + + /// Configure metadata size hint for Parquet files reading (in bytes) + pub fn metadata_size_hint(mut self, size_hint: Option) -> Self { + self.metadata_size_hint = size_hint; + self + } } /// Options that control the reading of ARROW files. @@ -606,6 +615,11 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> { if let Some(file_decryption_properties) = &self.file_decryption_properties { options.crypto.file_decryption = Some(file_decryption_properties.clone()); } + // This can be overridden per-read in ParquetReadOptions, if setting. + if let Some(metadata_size_hint) = self.metadata_size_hint { + options.global.metadata_size_hint = Some(metadata_size_hint); + } + let mut file_format = ParquetFormat::new().with_options(options); if let Some(parquet_pruning) = self.parquet_pruning { diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 1781ea569d90..52c5393e1031 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -546,7 +546,8 @@ mod tests { let (files, _file_names) = store_parquet(vec![batch1], false).await?; let state = SessionContext::new().state(); - let format = ParquetFormat::default(); + // Make metadata size hint None to keep original behavior + let format = ParquetFormat::default().with_metadata_size_hint(None); let _schema = format.infer_schema(&state, &store.upcast(), &files).await?; assert_eq!(store.request_count(), 3); // No increase, cache being used. 
@@ -620,7 +621,9 @@ mod tests { let mut state = SessionContext::new().state(); state = set_view_state(state, force_views); - let format = ParquetFormat::default().with_force_view_types(force_views); + let format = ParquetFormat::default() + .with_force_view_types(force_views) + .with_metadata_size_hint(None); let schema = format.infer_schema(&state, &store.upcast(), &files).await?; assert_eq!(store.request_count(), 6); diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index 6b9585f408a1..d1592c21472d 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -27,7 +27,7 @@ use arrow::array::{ArrayRef, Int32Array, RecordBatch}; use async_trait::async_trait; use bytes::Bytes; -use datafusion::prelude::{CsvReadOptions, SessionContext}; +use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext}; use futures::stream::BoxStream; use insta::assert_snapshot; use object_store::memory::InMemory; @@ -45,8 +45,9 @@ use url::Url; #[tokio::test] async fn create_single_csv_file() { + let test = Test::new().with_single_file_csv().await; assert_snapshot!( - single_file_csv_test().await.requests(), + test.requests(), @r" RequestCountingObjectStore() Total Requests: 2 @@ -58,8 +59,9 @@ async fn create_single_csv_file() { #[tokio::test] async fn query_single_csv_file() { + let test = Test::new().with_single_file_csv().await; assert_snapshot!( - single_file_csv_test().await.query("select * from csv_table").await, + test.query("select * from csv_table").await, @r" ------- Query Output (2 rows) ------- +---------+-------+-------+ @@ -79,8 +81,9 @@ async fn query_single_csv_file() { #[tokio::test] async fn create_multi_file_csv_file() { + let test = Test::new().with_multi_file_csv().await; assert_snapshot!( - multi_file_csv_test().await.requests(), + test.requests(), @r" RequestCountingObjectStore() Total Requests: 4 @@ -94,8 +97,9 @@ async fn create_multi_file_csv_file() { #[tokio::test] async fn query_multi_csv_file() { + let test = Test::new().with_multi_file_csv().await; assert_snapshot!( - multi_file_csv_test().await.query("select * from csv_table").await, + test.query("select * from csv_table").await, @r" ------- Query Output (6 rows) ------- +---------+-------+-------+ @@ -120,24 +124,132 @@ async fn query_multi_csv_file() { } #[tokio::test] -async fn create_single_parquet_file() { +async fn create_single_parquet_file_default() { + // The default metadata size hint is 512KB + // which is enough to fetch the entire footer metadata and PageIndex + // in a single GET request. 
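    // Note: the 512 KiB default exceeds this small test file, so the single ranged
    // GET recorded below (range=0-2994) spans the entire file in one request.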
+ let test = Test::new().with_single_file_parquet().await; + // expect 1 get request which reads the footer metadata and page index assert_snapshot!( - single_file_parquet_test().await.requests(), + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=0-2994 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_prefetch() { + // Explicitly specify a prefetch hint that is adequate for the footer and page index + let test = Test::new() + .with_parquet_metadata_size_hint(Some(1000)) + .with_single_file_parquet() + .await; + // expect 1 1000 byte request which reads the footer metadata and page index + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=1994-2994 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_too_small_prefetch() { + // configure a prefetch size that is too small to fetch the footer + // metadata + // + // Using the ranges from the test below (with no_prefetch), + // pick a number less than 730: + // -------- + // 2286-2294: (8 bytes) footer + length + // 2264-2986: (722 bytes) footer metadata + let test = Test::new() + .with_parquet_metadata_size_hint(Some(500)) + .with_single_file_parquet() + .await; + // expect three get requests: + // 1. read the footer (500 bytes per hint, not enough for the footer metadata) + // 2. Read the footer metadata + // 3. reads the PageIndex + assert_snapshot!( + test.requests(), @r" RequestCountingObjectStore() Total Requests: 4 - HEAD path=parquet_table.parquet - - GET (range) range=2986-2994 path=parquet_table.parquet + - GET (range) range=2494-2994 path=parquet_table.parquet - GET (range) range=2264-2986 path=parquet_table.parquet - GET (range) range=2124-2264 path=parquet_table.parquet " ); } +#[tokio::test] +async fn create_single_parquet_file_small_prefetch() { + // configure a prefetch size that is large enough for the footer + // metadata but **not** the PageIndex + // + // Using the ranges from the test below (with no_prefetch), + // the 730 is determined as follows; + // -------- + // 2286-2294: (8 bytes) footer + length + // 2264-2986: (722 bytes) footer metadata + let test = Test::new() + // 740 is enough to get both the footer + length (8 bytes) + // but not the entire PageIndex + .with_parquet_metadata_size_hint(Some(740)) + .with_single_file_parquet() + .await; + // expect two get requests: + // 1. read the footer metadata + // 2. reads the PageIndex + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 3 + - HEAD path=parquet_table.parquet + - GET (range) range=2254-2994 path=parquet_table.parquet + - GET (range) range=2124-2264 path=parquet_table.parquet + " + ); +} + +#[tokio::test] +async fn create_single_parquet_file_no_prefetch() { + let test = Test::new() + // force no prefetch by setting size hint to None + .with_parquet_metadata_size_hint(None) + .with_single_file_parquet() + .await; + // Without a metadata size hint, the parquet reader + // does *three* range requests to read the footer metadata: + // 1. The footer length (last 8 bytes) + // 2. The footer metadata + // 3. 
The PageIndex metadata + assert_snapshot!( + test.requests(), + @r" + RequestCountingObjectStore() + Total Requests: 2 + - HEAD path=parquet_table.parquet + - GET (range) range=0-2994 path=parquet_table.parquet + " + ); +} + #[tokio::test] async fn query_single_parquet_file() { + let test = Test::new().with_single_file_parquet().await; assert_snapshot!( - single_file_parquet_test().await.query("select count(distinct a), count(b) from parquet_table").await, + test.query("select count(distinct a), count(b) from parquet_table").await, @r" ------- Query Output (1 rows) ------- +---------------------------------+------------------------+ @@ -157,10 +269,11 @@ async fn query_single_parquet_file() { #[tokio::test] async fn query_single_parquet_file_with_single_predicate() { + let test = Test::new().with_single_file_parquet().await; // Note that evaluating predicates requires additional object store requests // (to evaluate predicates) assert_snapshot!( - single_file_parquet_test().await.query("select min(a), max(b) from parquet_table WHERE a > 150").await, + test.query("select min(a), max(b) from parquet_table WHERE a > 150").await, @r" ------- Query Output (1 rows) ------- +----------------------+----------------------+ @@ -179,10 +292,12 @@ async fn query_single_parquet_file_with_single_predicate() { #[tokio::test] async fn query_single_parquet_file_multi_row_groups_multiple_predicates() { + let test = Test::new().with_single_file_parquet().await; + // Note that evaluating predicates requires additional object store requests // (to evaluate predicates) assert_snapshot!( - single_file_parquet_test().await.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await, + test.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await, @r" ------- Query Output (1 rows) ------- +----------------------+----------------------+ @@ -200,75 +315,16 @@ async fn query_single_parquet_file_multi_row_groups_multiple_predicates() { ); } -/// Create a test with a single CSV file with three columns and two rows -async fn single_file_csv_test() -> Test { - // upload CSV data to object store - let csv_data = r#"c1,c2,c3 -0.00001,5e-12,true -0.00002,4e-12,false -"#; - - Test::new() - .with_bytes("/csv_table.csv", csv_data) - .await - .register_csv("csv_table", "/csv_table.csv") - .await -} - -/// Create a test with three CSV files in a directory -async fn multi_file_csv_test() -> Test { - let mut test = Test::new(); - // upload CSV data to object store - for i in 0..3 { - let csv_data1 = format!( - r#"c1,c2,c3 -0.0000{i},{i}e-12,true -0.00003,5e-12,false -"# - ); - test = test - .with_bytes(&format!("/data/file_{i}.csv"), csv_data1) - .await; - } - // register table - test.register_csv("csv_table", "/data/").await -} - -/// Create a test with a single parquet file that has two -/// columns and two row groups -/// -/// Column "a": Int32 with values 0-100] in row group 1 -/// and [101-200] in row group 2 -/// -/// Column "b": Int32 with values 1000-1100] in row group 1 -/// and [1101-1200] in row group 2 -async fn single_file_parquet_test() -> Test { - // Create parquet bytes - let a: ArrayRef = Arc::new(Int32Array::from_iter_values(0..200)); - let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200)); - let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap(); - - let mut buffer = vec![]; - let props = parquet::file::properties::WriterProperties::builder() - .set_max_row_group_size(100) - .build(); - let mut writer = - 
parquet::arrow::ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)) - .unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); - - Test::new() - .with_bytes("/parquet_table.parquet", buffer) - .await - .register_parquet("parquet_table", "/parquet_table.parquet") - .await -} - /// Runs tests with a request counting object store struct Test { object_store: Arc, session_context: SessionContext, + /// metadata size hint to use when registering parquet files + /// + /// * `None`: uses the default (does not set a size_hint) + /// * `Some(None)`L: set prefetch hint to None (prefetching) + /// * `Some(Some(size))`: set prefetch hint to size + parquet_metadata_size_hint: Option>, } impl Test { @@ -281,9 +337,16 @@ impl Test { Self { object_store, session_context, + parquet_metadata_size_hint: None, } } + /// Specify the metadata size hint to use when registering parquet files + fn with_parquet_metadata_size_hint(mut self, size_hint: Option) -> Self { + self.parquet_metadata_size_hint = Some(size_hint); + self + } + /// Returns a string representation of all recorded requests thus far fn requests(&self) -> String { format!("{}", self.object_store) @@ -312,16 +375,88 @@ impl Test { self } - /// Register a CSV file at the given path relative to the [`datafusion_test_data`] directory + /// Register a Parquet file at the given path relative to the + /// [`datafusion_test_data`] directory async fn register_parquet(self, table_name: &str, path: &str) -> Self { let path = format!("mem://{path}"); + let mut options: ParquetReadOptions<'_> = ParquetReadOptions::new(); + + // If a metadata size hint was specified, apply it + if let Some(parquet_metadata_size_hint) = self.parquet_metadata_size_hint { + options = options.metadata_size_hint(parquet_metadata_size_hint); + } + self.session_context - .register_parquet(table_name, path, Default::default()) + .register_parquet(table_name, path, options) .await .unwrap(); self } + /// Register a single CSV file with three columns and two row named + /// `csv_table` + async fn with_single_file_csv(self) -> Test { + // upload CSV data to object store + let csv_data = r#"c1,c2,c3 +0.00001,5e-12,true +0.00002,4e-12,false +"#; + self.with_bytes("/csv_table.csv", csv_data) + .await + .register_csv("csv_table", "/csv_table.csv") + .await + } + + /// Register three CSV files in a directory, called `csv_table` + async fn with_multi_file_csv(mut self) -> Test { + // upload CSV data to object store + for i in 0..3 { + let csv_data1 = format!( + r#"c1,c2,c3 +0.0000{i},{i}e-12,true +0.00003,5e-12,false +"# + ); + self = self + .with_bytes(&format!("/data/file_{i}.csv"), csv_data1) + .await; + } + // register table + self.register_csv("csv_table", "/data/").await + } + + /// Add a single parquet file that has two columns and two row groups named `parquet_table` + /// + /// Column "a": Int32 with values 0-100] in row group 1 + /// and [101-200] in row group 2 + /// + /// Column "b": Int32 with values 1000-1100] in row group 1 + /// and [1101-1200] in row group 2 + async fn with_single_file_parquet(self) -> Test { + // Create parquet bytes + let a: ArrayRef = Arc::new(Int32Array::from_iter_values(0..200)); + let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200)); + let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap(); + + let mut buffer = vec![]; + let props = parquet::file::properties::WriterProperties::builder() + .set_max_row_group_size(100) + .build(); + let mut writer = parquet::arrow::ArrowWriter::try_new( + &mut buffer, + 
batch.schema(), + Some(props), + ) + .unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + self.with_bytes("/parquet_table.parquet", buffer) + .await + .register_parquet("parquet_table", "/parquet_table.parquet") + .await + } + /// Runs the specified query and returns a string representation of the results /// suitable for comparison with insta snapshots /// diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index b15ec026372d..f1cc4c7a0cc9 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -246,7 +246,7 @@ datafusion.execution.parquet.max_predicate_cache_size NULL datafusion.execution.parquet.max_row_group_size 1048576 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 -datafusion.execution.parquet.metadata_size_hint NULL +datafusion.execution.parquet.metadata_size_hint 524288 datafusion.execution.parquet.pruning true datafusion.execution.parquet.pushdown_filters false datafusion.execution.parquet.reorder_filters false @@ -366,7 +366,7 @@ datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. -datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer +datafusion.execution.parquet.metadata_size_hint 524288 (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. 
datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index fbf55a56057b..7ca5eb8f7be4 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -81,7 +81,7 @@ The following configuration settings are available: | datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. | | datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file | | datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata | -| datafusion.execution.parquet.metadata_size_hint | NULL | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer | +| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the hint, two reads will still be performed. | | datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". | | datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query | | datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. 
| From 32b1fe3eef7839d70a371c0927d003740e734289 Mon Sep 17 00:00:00 2001 From: Michael Kleen Date: Tue, 28 Oct 2025 12:38:06 +0100 Subject: [PATCH 034/157] Fix: Add projection to generate_series (#18298) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes https://github.com/apache/datafusion/issues/17830 ## Rationale for this change The queries from the original ticket fail, because an unprojected `generate_series` function would produce in a join the wrong number of columns which leads to a runtime error. ## What changes are included in this PR? This adds a missing projection to `generate_series` to ensure values are only emitted when projected. ## Are these changes tested? I added a sql-logic test. I also compared the results against Postgres and DuckDB: Postgres: ```sql mkleen=# SELECT v1 FROM (select generate_series as v1 from generate_series(1, 3)) g1, (select generate_series as v2 from generate_series(1, 3)) g2; v1 ---- 1 1 1 2 2 2 3 3 3 (9 rows) ``` DuckDB: ```sql D SELECT v1 FROM (select generate_series as v1 from generate_series(1, 3)) g1, (select generate_series as v2 from generate_series(1, 3)) g2; ┌───────┐ │ v1 │ │ int64 │ ├───────┤ │ 1 │ │ 2 │ │ 3 │ │ 1 │ │ 2 │ │ 3 │ │ 1 │ │ 2 │ │ 3 │ └───────┘ ``` ## Are there any user-facing changes? No --- datafusion/functions-table/src/generate_series.rs | 13 +++++++++++-- datafusion/proto/src/physical_plan/mod.rs | 3 ++- .../sqllogictest/test_files/table_functions.slt | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-table/src/generate_series.rs b/datafusion/functions-table/src/generate_series.rs index d00f3d734d76..c66e652147eb 100644 --- a/datafusion/functions-table/src/generate_series.rs +++ b/datafusion/functions-table/src/generate_series.rs @@ -237,6 +237,7 @@ impl GenerateSeriesTable { pub fn as_generator( &self, batch_size: usize, + projection: Option>, ) -> Result>> { let generator: Arc> = match &self.args { GenSeriesArgs::ContainsNull { name } => Arc::new(RwLock::new(Empty { name })), @@ -255,6 +256,7 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, + projection, })), GenSeriesArgs::TimestampArgs { start, @@ -295,6 +297,7 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, + projection, })) } GenSeriesArgs::DateArgs { @@ -324,6 +327,7 @@ impl GenerateSeriesTable { batch_size, include_end: *include_end, name, + projection, })), }; @@ -341,6 +345,7 @@ pub struct GenericSeriesState { current: T, include_end: bool, name: &'static str, + projection: Option>, } impl GenericSeriesState { @@ -396,7 +401,11 @@ impl LazyBatchGenerator for GenericSeriesState { let array = self.current.create_array(buf)?; let batch = RecordBatch::try_new(Arc::clone(&self.schema), vec![array])?; - Ok(Some(batch)) + let projected = match self.projection.as_ref() { + Some(projection) => batch.project(projection)?, + None => batch, + }; + Ok(Some(projected)) } } @@ -477,7 +486,7 @@ impl TableProvider for GenerateSeriesTable { None => self.schema(), }; - let generator = self.as_generator(batch_size)?; + let generator = self.as_generator(batch_size, projection.cloned())?; Ok(Arc::new(LazyMemoryExec::try_new(schema, vec![generator])?)) } diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index e5f4a1f7d026..0ebbb373f2d1 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -1940,7 
+1940,8 @@ impl protobuf::PhysicalPlanNode { }; let table = GenerateSeriesTable::new(Arc::clone(&schema), args); - let generator = table.as_generator(generate_series.target_batch_size as usize)?; + let generator = + table.as_generator(generate_series.target_batch_size as usize, None)?; Ok(Arc::new(LazyMemoryExec::try_new(schema, vec![generator])?)) } diff --git a/datafusion/sqllogictest/test_files/table_functions.slt b/datafusion/sqllogictest/test_files/table_functions.slt index 484004c14e03..57b83b6d3e85 100644 --- a/datafusion/sqllogictest/test_files/table_functions.slt +++ b/datafusion/sqllogictest/test_files/table_functions.slt @@ -188,6 +188,21 @@ SELECT generate_series(1, t1.end) FROM generate_series(3, 5) as t1(end) [1, 2, 3, 4] [1, 2, 3] +# join with projection on generate_series +query I +select g1.value from generate_series(1, 3) g1 CROSS JOIN generate_series(1, 3) g2; +---- +1 +1 +1 +2 +2 +2 +3 +3 +3 + + # Test range table function query I SELECT * FROM range(6) From 3edb38a5e126d9588b4067bdf9e90978d7c686d6 Mon Sep 17 00:00:00 2001 From: gene-bordegaray Date: Tue, 28 Oct 2025 07:40:16 -0400 Subject: [PATCH 035/157] fix: Add WITH ORDER display in information_schema.views (#18282) ## Which issue does this PR close? - Closes #18267. /cc @NGA-TRAN ## Rationale for this change The `information_schema.views` does not have display `WITH ORDER` for the definition of a table. ## What changes are included in this PR? Added condition for writing `WITH ORDER` for CreateExternalTable. ## Are these changes tested? Did not add tests for this functionality as not other display functionality has tests and seems like a separate PR would be appropriate if this is needed. This was tested manually with: In `datafusion-cli` ``` -- Not sorted CREATE EXTERNAL TABLE dimension_csv STORED AS CSV LOCATION '/path/to/the/attached/dimension_1.csv' OPTIONS ('format.has_header' 'true'); -- Sorted CREATE EXTERNAL TABLE dimension_csv_sorted STORED AS CSV WITH ORDER (env, service, host) LOCATION '/path/to/the/attached/dimension_1.csv' OPTIONS ('format.has_header' 'true'); ``` Then running: ``` select * from information_schema.views; ``` With link to data: [dimension_1.csv](https://github.com/user-attachments/files/23124138/dimension_1.csv) ## Are there any user-facing changes? 
Yes, improves the information_schema.views display to include `WITH ORDER` --- datafusion/sql/src/parser.rs | 14 +++++- .../test_files/information_schema.slt | 48 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 271ad8a856b4..1f1ef2a672ab 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -243,7 +243,19 @@ impl fmt::Display for CreateExternalTable { } write!(f, "{} ", self.name)?; write!(f, "STORED AS {} ", self.file_type)?; - write!(f, "LOCATION {} ", self.location) + if !self.order_exprs.is_empty() { + write!(f, "WITH ORDER (")?; + let mut first = true; + for expr in self.order_exprs.iter().flatten() { + if !first { + write!(f, ", ")?; + } + write!(f, "{expr}")?; + first = false; + } + write!(f, ") ")?; + } + write!(f, "LOCATION {}", self.location) } } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f1cc4c7a0cc9..c67405715149 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -705,6 +705,54 @@ SHOW CREATE TABLE abc; ---- datafusion public abc CREATE EXTERNAL TABLE abc STORED AS CSV LOCATION ../../testing/data/csv/aggregate_test_100.csv +# show_external_create_table_with_order +statement ok +CREATE EXTERNAL TABLE abc_ordered +STORED AS CSV +WITH ORDER (c1) +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +query TTTT +SHOW CREATE TABLE abc_ordered; +---- +datafusion public abc_ordered CREATE EXTERNAL TABLE abc_ordered STORED AS CSV WITH ORDER (c1) LOCATION ../../testing/data/csv/aggregate_test_100.csv + +statement ok +DROP TABLE abc_ordered; + +# show_external_create_table_with_multiple_order_columns +statement ok +CREATE EXTERNAL TABLE abc_multi_order +STORED AS CSV +WITH ORDER (c1, c2 DESC) +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +query TTTT +SHOW CREATE TABLE abc_multi_order; +---- +datafusion public abc_multi_order CREATE EXTERNAL TABLE abc_multi_order STORED AS CSV WITH ORDER (c1, c2 DESC) LOCATION ../../testing/data/csv/aggregate_test_100.csv + +statement ok +DROP TABLE abc_multi_order; + +# show_external_create_table_with_order_nulls +statement ok +CREATE EXTERNAL TABLE abc_order_nulls +STORED AS CSV +WITH ORDER (c1 NULLS LAST, c2 DESC NULLS FIRST) +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); + +query TTTT +SHOW CREATE TABLE abc_order_nulls; +---- +datafusion public abc_order_nulls CREATE EXTERNAL TABLE abc_order_nulls STORED AS CSV WITH ORDER (c1 NULLS LAST, c2 DESC NULLS FIRST) LOCATION ../../testing/data/csv/aggregate_test_100.csv + +statement ok +DROP TABLE abc_order_nulls; + # string_agg has different arg_types but same return type. Test avoiding duplicate entries for the same function. query TTT select routine_name, data_type, function_type from information_schema.routines where routine_name = 'string_agg'; From 63b4c8492367d40c258fa355587ba2cc785da53b Mon Sep 17 00:00:00 2001 From: Marc Brinkmann Date: Tue, 28 Oct 2025 16:18:47 +0100 Subject: [PATCH 036/157] Do not accept null is_set for first_value/last_value (#18301) ## Which issue does this PR close? 
- Closes #18300 ## Rationale for this change As laid out in the issue, this improves internal checks by testing an assumed invariant, instead of silently nulling data on error. The cost is a single null check on a column with a number of entries dependent on the number of partitions, not the data itself. ## What changes are included in this PR? * Adds a null check to the second column of `merge_batch` of both `FIRST_VALUE` and `LAST_VALUE`. ## Are these changes tested? Tests are included. ## Are there any user-facing changes? Hopefully not. --- .../functions-aggregate/src/first_last.rs | 106 +++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index b2a40ff50bd7..73f2ec112ffc 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -817,6 +817,8 @@ impl Accumulator for TrivialFirstValueAccumulator { // Second index contains is_set flag. if !self.is_set { let flags = states[1].as_boolean(); + validate_is_set_flags(flags, "first_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..1], flags)?; if let Some(first) = filtered_states.first() { @@ -962,6 +964,8 @@ impl Accumulator for FirstValueAccumulator { // last index contains is_set flag. let is_set_idx = states.len() - 1; let flags = states[is_set_idx].as_boolean(); + validate_is_set_flags(flags, "first_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..is_set_idx], flags)?; // 1..is_set_idx range corresponds to ordering section @@ -1299,6 +1303,8 @@ impl Accumulator for TrivialLastValueAccumulator { // LAST_VALUE(last1, last2, last3, ...) // Second index contains is_set flag. let flags = states[1].as_boolean(); + validate_is_set_flags(flags, "last_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..1], flags)?; if let Some(last) = filtered_states.last() { if !last.is_empty() { @@ -1444,6 +1450,8 @@ impl Accumulator for LastValueAccumulator { // last index contains is_set flag. let is_set_idx = states.len() - 1; let flags = states[is_set_idx].as_boolean(); + validate_is_set_flags(flags, "last_value")?; + let filtered_states = filter_states_according_to_is_set(&states[0..is_set_idx], flags)?; // 1..is_set_idx range corresponds to ordering section @@ -1487,6 +1495,16 @@ impl Accumulator for LastValueAccumulator { } } +/// Validates that `is_set flags` do not contain NULL values. +fn validate_is_set_flags(flags: &BooleanArray, function_name: &str) -> Result<()> { + if flags.null_count() > 0 { + return Err(DataFusionError::Internal(format!( + "{function_name}: is_set flags contain nulls" + ))); + } + Ok(()) +} + /// Filters states according to the `is_set` flag at the last column and returns /// the resulting states. 
fn filter_states_according_to_is_set( @@ -1515,7 +1533,7 @@ mod tests { use std::iter::repeat_with; use arrow::{ - array::{Int64Array, ListArray}, + array::{BooleanArray, Int64Array, ListArray, StringArray}, compute::SortOptions, datatypes::Schema, }; @@ -1928,4 +1946,90 @@ mod tests { Ok(()) } + + #[test] + fn test_first_value_merge_with_is_set_nulls() -> Result<()> { + // Test data with corrupted is_set flag + let value = Arc::new(StringArray::from(vec![Some("first_string")])) as ArrayRef; + let corrupted_flag = Arc::new(BooleanArray::from(vec![None])) as ArrayRef; + + // Test TrivialFirstValueAccumulator + let mut trivial_accumulator = + TrivialFirstValueAccumulator::try_new(&DataType::Utf8, false)?; + let trivial_states = vec![Arc::clone(&value), Arc::clone(&corrupted_flag)]; + let result = trivial_accumulator.merge_batch(&trivial_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + // Test FirstValueAccumulator (with ordering) + let schema = Schema::new(vec![Field::new("ordering", DataType::Int64, false)]); + let ordering_expr = col("ordering", &schema)?; + let mut ordered_accumulator = FirstValueAccumulator::try_new( + &DataType::Utf8, + &[DataType::Int64], + LexOrdering::new(vec![PhysicalSortExpr { + expr: ordering_expr, + options: SortOptions::default(), + }]) + .unwrap(), + false, + false, + )?; + let ordering = Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef; + let ordered_states = vec![value, ordering, corrupted_flag]; + let result = ordered_accumulator.merge_batch(&ordered_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + Ok(()) + } + + #[test] + fn test_last_value_merge_with_is_set_nulls() -> Result<()> { + // Test data with corrupted is_set flag + let value = Arc::new(StringArray::from(vec![Some("last_string")])) as ArrayRef; + let corrupted_flag = Arc::new(BooleanArray::from(vec![None])) as ArrayRef; + + // Test TrivialLastValueAccumulator + let mut trivial_accumulator = + TrivialLastValueAccumulator::try_new(&DataType::Utf8, false)?; + let trivial_states = vec![Arc::clone(&value), Arc::clone(&corrupted_flag)]; + let result = trivial_accumulator.merge_batch(&trivial_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + // Test LastValueAccumulator (with ordering) + let schema = Schema::new(vec![Field::new("ordering", DataType::Int64, false)]); + let ordering_expr = col("ordering", &schema)?; + let mut ordered_accumulator = LastValueAccumulator::try_new( + &DataType::Utf8, + &[DataType::Int64], + LexOrdering::new(vec![PhysicalSortExpr { + expr: ordering_expr, + options: SortOptions::default(), + }]) + .unwrap(), + false, + false, + )?; + let ordering = Arc::new(Int64Array::from(vec![Some(1)])) as ArrayRef; + let ordered_states = vec![value, ordering, corrupted_flag]; + let result = ordered_accumulator.merge_batch(&ordered_states); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is_set flags contain nulls")); + + Ok(()) + } } From cfeb8faa962a86a21bab13f6665842ea12062bb7 Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Tue, 28 Oct 2025 16:20:22 +0100 Subject: [PATCH 037/157] Optimize merging of partial case expression results (#18152) ## Which issue does this PR close? 
- Improvement in the context of https://github.com/apache/datafusion/issues/18075 - Continues on #17898 ## Rationale for this change Case evaluation currently uses `PhysicalExpr::evaluate_selection` for each branch of the case expression. This implementation is fine, but because `evaluate_selection` is not specific to the `case` logic we're missing some optimisation opportunities. The main consequence is that too much work is being done filtering record batches and scattering results. This PR introduces specialised filtering logic and result interleaving for case. A more detailed description and diagrams are available at https://github.com/apache/datafusion/issues/18075#issuecomment-3422326710 ## What changes are included in this PR? Rewrite the `case_when_no_expr` and `case_when_with_expr` evaluation loops to avoid as much unnecessary work as possible. In particular the remaining rows to be evaluated are retained across loop iterations. This allows the record batch that needs to be filtered to shrink as the loop is being evaluated which reduces the number of rows that needs to be refiltered. If a when predicate does not match any rows at all, filtering is avoided entirely. The final result is also not merged every loop iteration. Instead an index vector is constructed which is used to compose the final result once using a custom 'multi zip'/'interleave' like operation. ## Are these changes tested? Covered by existing unit tests and SLTs ## Are there any user-facing changes? No --- .../physical-expr/src/expressions/case.rs | 732 +++++++++++++++--- datafusion/sqllogictest/test_files/case.slt | 22 + 2 files changed, 636 insertions(+), 118 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 2db599047bcd..0b4c3af1d9c5 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -15,25 +15,28 @@ // specific language governing permissions and limitations // under the License. +use super::{Column, Literal}; +use crate::expressions::case::ResultState::{Complete, Empty, Partial}; use crate::expressions::try_cast; use crate::PhysicalExpr; -use std::borrow::Cow; -use std::hash::Hash; -use std::{any::Any, sync::Arc}; - use arrow::array::*; use arrow::compute::kernels::zip::zip; -use arrow::compute::{and, and_not, is_null, not, nullif, or, prep_null_mask_filter}; -use arrow::datatypes::{DataType, Schema}; +use arrow::compute::{ + is_not_null, not, nullif, prep_null_mask_filter, FilterBuilder, FilterPredicate, +}; +use arrow::datatypes::{DataType, Schema, UInt32Type}; +use arrow::error::ArrowError; use datafusion_common::cast::as_boolean_array; use datafusion_common::{ exec_err, internal_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, }; use datafusion_expr::ColumnarValue; - -use super::{Column, Literal}; use datafusion_physical_expr_common::datum::compare_with_eq; use itertools::Itertools; +use std::borrow::Cow; +use std::fmt::{Debug, Formatter}; +use std::hash::Hash; +use std::{any::Any, sync::Arc}; type WhenThen = (Arc, Arc); @@ -98,7 +101,7 @@ pub struct CaseExpr { } impl std::fmt::Display for CaseExpr { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { write!(f, "CASE ")?; if let Some(e) = &self.expr { write!(f, "{e} ")?; @@ -122,6 +125,419 @@ fn is_cheap_and_infallible(expr: &Arc) -> bool { expr.as_any().is::() } +/// Creates a [FilterPredicate] from a boolean array. 
+fn create_filter(predicate: &BooleanArray) -> FilterPredicate { + let mut filter_builder = FilterBuilder::new(predicate); + // Always optimize the filter since we use them multiple times. + filter_builder = filter_builder.optimize(); + filter_builder.build() +} + +// This should be removed when https://github.com/apache/arrow-rs/pull/8693 +// is merged and becomes available. +fn filter_record_batch( + record_batch: &RecordBatch, + filter: &FilterPredicate, +) -> std::result::Result { + let filtered_columns = record_batch + .columns() + .iter() + .map(|a| filter_array(a, filter)) + .collect::, _>>()?; + // SAFETY: since we start from a valid RecordBatch, there's no need to revalidate the schema + // since the set of columns has not changed. + // The input column arrays all had the same length (since they're coming from a valid RecordBatch) + // and the filtering them with the same filter will produces a new set of arrays with identical + // lengths. + unsafe { + Ok(RecordBatch::new_unchecked( + record_batch.schema(), + filtered_columns, + filter.count(), + )) + } +} + +// This function exists purely to be able to use the same call style +// for `filter_record_batch` and `filter_array` at the point of use. +// When https://github.com/apache/arrow-rs/pull/8693 is available, replace +// both with method calls on `FilterPredicate`. +#[inline(always)] +fn filter_array( + array: &dyn Array, + filter: &FilterPredicate, +) -> std::result::Result { + filter.filter(array) +} + +/// Merges elements by index from a list of [`ArrayData`], creating a new [`ColumnarValue`] from +/// those values. +/// +/// Each element in `indices` is the index of an array in `values`. The `indices` array is processed +/// sequentially. The first occurrence of index value `n` will be mapped to the first +/// value of the array at index `n`. The second occurrence to the second value, and so on. +/// An index value where `PartialResultIndex::is_none` is `true` is used to indicate null values. +/// +/// # Implementation notes +/// +/// This algorithm is similar in nature to both `zip` and `interleave`, but there are some important +/// differences. +/// +/// In contrast to `zip`, this function supports multiple input arrays. Instead of a boolean +/// selection vector, an index array is to take values from the input arrays, and a special marker +/// value is used to indicate null values. +/// +/// In contrast to `interleave`, this function does not use pairs of indices. The values in +/// `indices` serve the same purpose as the first value in the pairs passed to `interleave`. +/// The index in the array is implicit and is derived from the number of times a particular array +/// index occurs. +/// The more constrained indexing mechanism used by this algorithm makes it easier to copy values +/// in contiguous slices. In the example below, the two subsequent elements from array `2` can be +/// copied in a single operation from the source array instead of copying them one by one. +/// Long spans of null values are also especially cheap because they do not need to be represented +/// in an input array. +/// +/// # Safety +/// +/// This function does not check that the number of occurrences of any particular array index matches +/// the length of the corresponding input array. If an array contains more values than required, the +/// spurious values will be ignored. If an array contains fewer values than necessary, this function +/// will panic. 
+/// +/// # Example +/// +/// ```text +/// ┌───────────┐ ┌─────────┐ ┌─────────┐ +/// │┌─────────┐│ │ None │ │ NULL │ +/// ││ A ││ ├─────────┤ ├─────────┤ +/// │└─────────┘│ │ 1 │ │ B │ +/// │┌─────────┐│ ├─────────┤ ├─────────┤ +/// ││ B ││ │ 0 │ merge(values, indices) │ A │ +/// │└─────────┘│ ├─────────┤ ─────────────────────────▶ ├─────────┤ +/// │┌─────────┐│ │ None │ │ NULL │ +/// ││ C ││ ├─────────┤ ├─────────┤ +/// │├─────────┤│ │ 2 │ │ C │ +/// ││ D ││ ├─────────┤ ├─────────┤ +/// │└─────────┘│ │ 2 │ │ D │ +/// └───────────┘ └─────────┘ └─────────┘ +/// values indices result +/// +/// ``` +fn merge(values: &[ArrayData], indices: &[PartialResultIndex]) -> Result { + #[cfg(debug_assertions)] + for ix in indices { + if let Some(index) = ix.index() { + assert!( + index < values.len(), + "Index out of bounds: {} >= {}", + index, + values.len() + ); + } + } + + let data_refs = values.iter().collect(); + let mut mutable = MutableArrayData::new(data_refs, true, indices.len()); + + // This loop extends the mutable array by taking slices from the partial results. + // + // take_offsets keeps track of how many values have been taken from each array. + let mut take_offsets = vec![0; values.len() + 1]; + let mut start_row_ix = 0; + loop { + let array_ix = indices[start_row_ix]; + + // Determine the length of the slice to take. + let mut end_row_ix = start_row_ix + 1; + while end_row_ix < indices.len() && indices[end_row_ix] == array_ix { + end_row_ix += 1; + } + let slice_length = end_row_ix - start_row_ix; + + // Extend mutable with either nulls or with values from the array. + match array_ix.index() { + None => mutable.extend_nulls(slice_length), + Some(index) => { + let start_offset = take_offsets[index]; + let end_offset = start_offset + slice_length; + mutable.extend(index, start_offset, end_offset); + take_offsets[index] = end_offset; + } + } + + if end_row_ix == indices.len() { + break; + } else { + // Set the start_row_ix for the next slice. + start_row_ix = end_row_ix; + } + } + + Ok(make_array(mutable.freeze())) +} + +/// An index into the partial results array that's more compact than `usize`. +/// +/// `u32::MAX` is reserved as a special 'none' value. This is used instead of +/// `Option` to keep the array of indices as compact as possible. +#[derive(Copy, Clone, PartialEq, Eq)] +struct PartialResultIndex { + index: u32, +} + +const NONE_VALUE: u32 = u32::MAX; + +impl PartialResultIndex { + /// Returns the 'none' placeholder value. + fn none() -> Self { + Self { index: NONE_VALUE } + } + + fn zero() -> Self { + Self { index: 0 } + } + + /// Creates a new partial result index. + /// + /// If the provided value is greater than or equal to `u32::MAX` + /// an error will be returned. + fn try_new(index: usize) -> Result { + let Ok(index) = u32::try_from(index) else { + return internal_err!("Partial result index exceeds limit"); + }; + + if index == NONE_VALUE { + return internal_err!("Partial result index exceeds limit"); + } + + Ok(Self { index }) + } + + /// Determines if this index is the 'none' placeholder value or not. + fn is_none(&self) -> bool { + self.index == NONE_VALUE + } + + /// Returns `Some(index)` if this value is not the 'none' placeholder, `None` otherwise. 
+ fn index(&self) -> Option { + if self.is_none() { + None + } else { + Some(self.index as usize) + } + } +} + +impl Debug for PartialResultIndex { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if self.is_none() { + write!(f, "null") + } else { + write!(f, "{}", self.index) + } + } +} + +enum ResultState { + /// The final result is an array containing only null values. + Empty, + /// The final result needs to be computed by merging the data in `arrays`. + Partial { + // A `Vec` of partial results that should be merged. + // `partial_result_indices` contains indexes into this vec. + arrays: Vec, + // Indicates per result row from which array in `partial_results` a value should be taken. + indices: Vec, + }, + /// A single branch matched all input rows. When creating the final result, no further merging + /// of partial results is necessary. + Complete(ColumnarValue), +} + +/// A builder for constructing result arrays for CASE expressions. +/// +/// Rather than building a monolithic array containing all results, it maintains a set of +/// partial result arrays and a mapping that indicates for each row which partial array +/// contains the result value for that row. +/// +/// On finish(), the builder will merge all partial results into a single array if necessary. +/// If all rows evaluated to the same array, that array can be returned directly without +/// any merging overhead. +struct ResultBuilder { + data_type: DataType, + /// The number of rows in the final result. + row_count: usize, + state: ResultState, +} + +impl ResultBuilder { + /// Creates a new ResultBuilder that will produce arrays of the given data type. + /// + /// The `row_count` parameter indicates the number of rows in the final result. + fn new(data_type: &DataType, row_count: usize) -> Self { + Self { + data_type: data_type.clone(), + row_count, + state: Empty, + } + } + + /// Adds a result for one branch of the case expression. + /// + /// `row_indices` should be a [UInt32Array] containing [RecordBatch] relative row indices + /// for which `value` contains result values. + /// + /// If `value` is a scalar, the scalar value will be used as the value for each row in `row_indices`. + /// + /// If `value` is an array, the values from the array and the indices from `row_indices` will be + /// processed pairwise. The lengths of `value` and `row_indices` must match. + /// + /// The diagram below shows a situation where a when expression matched rows 1 and 4 of the + /// record batch. The then expression produced the value array `[A, D]`. + /// After adding this result, the result array will have been added to `partial arrays` and + /// `partial indices` will have been updated at indexes `1` and `4`. 
+ /// + /// ```text + /// ┌─────────┐ ┌─────────┐┌───────────┐ ┌─────────┐┌───────────┐ + /// │ C │ │ 0: None ││┌ 0 ──────┐│ │ 0: None ││┌ 0 ──────┐│ + /// ├─────────┤ ├─────────┤││ A ││ ├─────────┤││ A ││ + /// │ D │ │ 1: None ││└─────────┘│ │ 1: 2 ││└─────────┘│ + /// └─────────┘ ├─────────┤│┌ 1 ──────┐│ add_branch_result( ├─────────┤│┌ 1 ──────┐│ + /// matching │ 2: 0 │││ B ││ row indices, │ 2: 0 │││ B ││ + /// 'then' values ├─────────┤│└─────────┘│ value ├─────────┤│└─────────┘│ + /// │ 3: None ││ │ ) │ 3: None ││┌ 2 ──────┐│ + /// ┌─────────┐ ├─────────┤│ │ ─────────────────────────▶ ├─────────┤││ C ││ + /// │ 1 │ │ 4: None ││ │ │ 4: 2 ││├─────────┤│ + /// ├─────────┤ ├─────────┤│ │ ├─────────┤││ D ││ + /// │ 4 │ │ 5: 1 ││ │ │ 5: 1 ││└─────────┘│ + /// └─────────┘ └─────────┘└───────────┘ └─────────┘└───────────┘ + /// row indices partial partial partial partial + /// indices arrays indices arrays + /// ``` + fn add_branch_result( + &mut self, + row_indices: &ArrayRef, + value: ColumnarValue, + ) -> Result<()> { + match value { + ColumnarValue::Array(a) => { + if a.len() != row_indices.len() { + internal_err!("Array length must match row indices length") + } else if row_indices.len() == self.row_count { + self.set_complete_result(ColumnarValue::Array(a)) + } else { + self.add_partial_result(row_indices, a.to_data()) + } + } + ColumnarValue::Scalar(s) => { + if row_indices.len() == self.row_count { + self.set_complete_result(ColumnarValue::Scalar(s)) + } else { + self.add_partial_result( + row_indices, + s.to_array_of_size(row_indices.len())?.to_data(), + ) + } + } + } + } + + /// Adds a partial result array. + /// + /// This method adds the given array data as a partial result and updates the index mapping + /// to indicate that the specified rows should take their values from this array. + /// The partial results will be merged into a single array when finish() is called. + fn add_partial_result( + &mut self, + row_indices: &ArrayRef, + row_values: ArrayData, + ) -> Result<()> { + if row_indices.null_count() != 0 { + return internal_err!("Row indices must not contain nulls"); + } + + match &mut self.state { + Empty => { + let array_index = PartialResultIndex::zero(); + let mut indices = vec![PartialResultIndex::none(); self.row_count]; + for row_ix in row_indices.as_primitive::().values().iter() { + indices[*row_ix as usize] = array_index; + } + + self.state = Partial { + arrays: vec![row_values], + indices, + }; + + Ok(()) + } + Partial { arrays, indices } => { + let array_index = PartialResultIndex::try_new(arrays.len())?; + + arrays.push(row_values); + + for row_ix in row_indices.as_primitive::().values().iter() { + // This is check is only active for debug config because the callers of this method, + // `case_when_with_expr` and `case_when_no_expr`, already ensure that + // they only calculate a value for each row at most once. + #[cfg(debug_assertions)] + if !indices[*row_ix as usize].is_none() { + return internal_err!("Duplicate value for row {}", *row_ix); + } + + indices[*row_ix as usize] = array_index; + } + Ok(()) + } + Complete(_) => internal_err!( + "Cannot add a partial result when complete result is already set" + ), + } + } + + /// Sets a result that applies to all rows. + /// + /// This is an optimization for cases where all rows evaluate to the same result. + /// When a complete result is set, the builder will return it directly from finish() + /// without any merging overhead. 
+ fn set_complete_result(&mut self, value: ColumnarValue) -> Result<()> { + match &self.state { + Empty => { + self.state = Complete(value); + Ok(()) + } + Partial { .. } => { + internal_err!( + "Cannot set a complete result when there are already partial results" + ) + } + Complete(_) => internal_err!("Complete result already set"), + } + } + + /// Finishes building the result and returns the final array. + fn finish(self) -> Result { + match self.state { + Empty => { + // No complete result and no partial results. + // This can happen for case expressions with no else branch where no rows + // matched. + Ok(ColumnarValue::Scalar(ScalarValue::try_new_null( + &self.data_type, + )?)) + } + Partial { arrays, indices } => { + // Merge partial results into a single array. + Ok(ColumnarValue::Array(merge(&arrays, &indices)?)) + } + Complete(v) => { + // If we have a complete result, we can just return it. + Ok(v) + } + } + } +} + impl CaseExpr { /// Create a new CASE WHEN expression pub fn try_new( @@ -196,82 +612,146 @@ impl CaseExpr { /// END fn case_when_with_expr(&self, batch: &RecordBatch) -> Result { let return_type = self.data_type(&batch.schema())?; - let expr = self.expr.as_ref().unwrap(); - let base_value = expr.evaluate(batch)?; - let base_value = base_value.into_array(batch.num_rows())?; - let base_nulls = is_null(base_value.as_ref())?; - - // start with nulls as default output - let mut current_value = new_null_array(&return_type, batch.num_rows()); - // We only consider non-null values while comparing with whens - let mut remainder = not(&base_nulls)?; - let mut non_null_remainder_count = remainder.true_count(); - for i in 0..self.when_then_expr.len() { - // If there are no rows left to process, break out of the loop early - if non_null_remainder_count == 0 { - break; - } + let mut result_builder = ResultBuilder::new(&return_type, batch.num_rows()); + + // `remainder_rows` contains the indices of the rows that need to be evaluated + let mut remainder_rows: ArrayRef = + Arc::new(UInt32Array::from_iter_values(0..batch.num_rows() as u32)); + // `remainder_batch` contains the rows themselves that need to be evaluated + let mut remainder_batch = Cow::Borrowed(batch); + + // evaluate the base expression + let mut base_values = self + .expr + .as_ref() + .unwrap() + .evaluate(batch)? + .into_array(batch.num_rows())?; - let when_predicate = &self.when_then_expr[i].0; - let when_value = when_predicate.evaluate_selection(batch, &remainder)?; - let when_value = when_value.into_array(batch.num_rows())?; - // build boolean array representing which rows match the "when" value - let when_match = compare_with_eq( - &when_value, - &base_value, - // The types of case and when expressions will be coerced to match. - // We only need to check if the base_value is nested. - base_value.data_type().is_nested(), - )?; - // Treat nulls as false - let when_match = match when_match.null_count() { - 0 => Cow::Borrowed(&when_match), - _ => Cow::Owned(prep_null_mask_filter(&when_match)), - }; - // Make sure we only consider rows that have not been matched yet - let when_value = and(&when_match, &remainder)?; + // Fill in a result value already for rows where the base expression value is null + // Since each when expression is tested against the base expression using the equality + // operator, null base values can never match any when expression. `x = NULL` is falsy, + // for all possible values of `x`. 
+ if base_values.null_count() > 0 { + // Use `is_not_null` since this is a cheap clone of the null buffer from 'base_value'. + // We already checked there are nulls, so we can be sure a new buffer will not be + // created. + let base_not_nulls = is_not_null(base_values.as_ref())?; + let base_all_null = base_values.null_count() == remainder_batch.num_rows(); + + // If there is an else expression, use that as the default value for the null rows + // Otherwise the default `null` value from the result builder will be used. + if let Some(e) = self.else_expr() { + let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; - // If the predicate did not match any rows, continue to the next branch immediately - let when_match_count = when_value.true_count(); - if when_match_count == 0 { - continue; + if base_all_null { + // All base values were null, so no need to filter + let nulls_value = expr.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, nulls_value)?; + } else { + // Filter out the null rows and evaluate the else expression for those + let nulls_filter = create_filter(¬(&base_not_nulls)?); + let nulls_batch = + filter_record_batch(&remainder_batch, &nulls_filter)?; + let nulls_rows = filter_array(&remainder_rows, &nulls_filter)?; + let nulls_value = expr.evaluate(&nulls_batch)?; + result_builder.add_branch_result(&nulls_rows, nulls_value)?; + } } - let then_expression = &self.when_then_expr[i].1; - let then_value = then_expression.evaluate_selection(batch, &when_value)?; + // All base values are null, so we can return early + if base_all_null { + return result_builder.finish(); + } - current_value = match then_value { - ColumnarValue::Scalar(ScalarValue::Null) => { - nullif(current_value.as_ref(), &when_value)? - } - ColumnarValue::Scalar(then_value) => { - zip(&when_value, &then_value.to_scalar()?, ¤t_value)? + // Remove the null rows from the remainder batch + let not_null_filter = create_filter(&base_not_nulls); + remainder_batch = + Cow::Owned(filter_record_batch(&remainder_batch, ¬_null_filter)?); + remainder_rows = filter_array(&remainder_rows, ¬_null_filter)?; + base_values = filter_array(&base_values, ¬_null_filter)?; + } + + // The types of case and when expressions will be coerced to match. + // We only need to check if the base_value is nested. + let base_value_is_nested = base_values.data_type().is_nested(); + + for i in 0..self.when_then_expr.len() { + // Evaluate the 'when' predicate for the remainder batch + // This results in a boolean array with the same length as the remaining number of rows + let when_expr = &self.when_then_expr[i].0; + let when_value = match when_expr.evaluate(&remainder_batch)? { + ColumnarValue::Array(a) => { + compare_with_eq(&a, &base_values, base_value_is_nested) } - ColumnarValue::Array(then_value) => { - zip(&when_value, &then_value, ¤t_value)? + ColumnarValue::Scalar(s) => { + let scalar = Scalar::new(s.to_array()?); + compare_with_eq(&scalar, &base_values, base_value_is_nested) } - }; + }?; - remainder = and_not(&remainder, &when_value)?; - non_null_remainder_count -= when_match_count; - } + // `true_count` ignores `true` values where the validity bit is not set, so there's + // no need to call `prep_null_mask_filter`. 
+ let when_true_count = when_value.true_count(); - if let Some(e) = self.else_expr() { - // null and unmatched tuples should be assigned else value - remainder = or(&base_nulls, &remainder)?; + // If the 'when' predicate did not match any rows, continue to the next branch immediately + if when_true_count == 0 { + continue; + } - if remainder.true_count() > 0 { - // keep `else_expr`'s data type and return type consistent - let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; + // If the 'when' predicate matched all remaining rows, there is no need to filter + if when_true_count == remainder_batch.num_rows() { + let then_expression = &self.when_then_expr[i].1; + let then_value = then_expression.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, then_value)?; + return result_builder.finish(); + } + + // Filter the remainder batch based on the 'when' value + // This results in a batch containing only the rows that need to be evaluated + // for the current branch + // Still no need to call `prep_null_mask_filter` since `create_filter` will already do + // this unconditionally. + let then_filter = create_filter(&when_value); + let then_batch = filter_record_batch(&remainder_batch, &then_filter)?; + let then_rows = filter_array(&remainder_rows, &then_filter)?; - let else_ = expr - .evaluate_selection(batch, &remainder)? - .into_array(batch.num_rows())?; - current_value = zip(&remainder, &else_, ¤t_value)?; + let then_expression = &self.when_then_expr[i].1; + let then_value = then_expression.evaluate(&then_batch)?; + result_builder.add_branch_result(&then_rows, then_value)?; + + // If this is the last 'when' branch and there is no 'else' expression, there's no + // point in calculating the remaining rows. + if self.else_expr.is_none() && i == self.when_then_expr.len() - 1 { + return result_builder.finish(); } + + // Prepare the next when branch (or the else branch) + let next_selection = match when_value.null_count() { + 0 => not(&when_value), + _ => { + // `prep_null_mask_filter` is required to ensure the not operation treats nulls + // as false + not(&prep_null_mask_filter(&when_value)) + } + }?; + let next_filter = create_filter(&next_selection); + remainder_batch = + Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?); + remainder_rows = filter_array(&remainder_rows, &next_filter)?; + base_values = filter_array(&base_values, &next_filter)?; + } + + // If we reached this point, some rows were left unmatched. + // Check if those need to be evaluated using the 'else' expression. 
+ if let Some(e) = self.else_expr() { + // keep `else_expr`'s data type and return type consistent + let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; + let else_value = expr.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, else_value)?; } - Ok(ColumnarValue::Array(current_value)) + result_builder.finish() } /// This function evaluates the form of CASE where each WHEN expression is a boolean @@ -283,70 +763,86 @@ impl CaseExpr { /// END fn case_when_no_expr(&self, batch: &RecordBatch) -> Result { let return_type = self.data_type(&batch.schema())?; + let mut result_builder = ResultBuilder::new(&return_type, batch.num_rows()); - // start with nulls as default output - let mut current_value = new_null_array(&return_type, batch.num_rows()); - let mut remainder = BooleanArray::from(vec![true; batch.num_rows()]); - let mut remainder_count = batch.num_rows(); - for i in 0..self.when_then_expr.len() { - // If there are no rows left to process, break out of the loop early - if remainder_count == 0 { - break; - } + // `remainder_rows` contains the indices of the rows that need to be evaluated + let mut remainder_rows: ArrayRef = + Arc::new(UInt32Array::from_iter(0..batch.num_rows() as u32)); + // `remainder_batch` contains the rows themselves that need to be evaluated + let mut remainder_batch = Cow::Borrowed(batch); + for i in 0..self.when_then_expr.len() { + // Evaluate the 'when' predicate for the remainder batch + // This results in a boolean array with the same length as the remaining number of rows let when_predicate = &self.when_then_expr[i].0; - let when_value = when_predicate.evaluate_selection(batch, &remainder)?; - let when_value = when_value.into_array(batch.num_rows())?; + let when_value = when_predicate + .evaluate(&remainder_batch)? + .into_array(remainder_batch.num_rows())?; let when_value = as_boolean_array(&when_value).map_err(|_| { internal_datafusion_err!("WHEN expression did not return a BooleanArray") })?; - // Treat 'NULL' as false value - let when_value = match when_value.null_count() { - 0 => Cow::Borrowed(when_value), - _ => Cow::Owned(prep_null_mask_filter(when_value)), - }; - // Make sure we only consider rows that have not been matched yet - let when_value = and(&when_value, &remainder)?; - // If the predicate did not match any rows, continue to the next branch immediately - let when_match_count = when_value.true_count(); - if when_match_count == 0 { + // `true_count` ignores `true` values where the validity bit is not set, so there's + // no need to call `prep_null_mask_filter`. + let when_true_count = when_value.true_count(); + + // If the 'when' predicate did not match any rows, continue to the next branch immediately + if when_true_count == 0 { continue; } + // If the 'when' predicate matched all remaining rows, there is no need to filter + if when_true_count == remainder_batch.num_rows() { + let then_expression = &self.when_then_expr[i].1; + let then_value = then_expression.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, then_value)?; + return result_builder.finish(); + } + + // Filter the remainder batch based on the 'when' value + // This results in a batch containing only the rows that need to be evaluated + // for the current branch + // Still no need to call `prep_null_mask_filter` since `create_filter` will already do + // this unconditionally. 
+ let then_filter = create_filter(when_value); + let then_batch = filter_record_batch(&remainder_batch, &then_filter)?; + let then_rows = filter_array(&remainder_rows, &then_filter)?; + let then_expression = &self.when_then_expr[i].1; - let then_value = then_expression.evaluate_selection(batch, &when_value)?; + let then_value = then_expression.evaluate(&then_batch)?; + result_builder.add_branch_result(&then_rows, then_value)?; - current_value = match then_value { - ColumnarValue::Scalar(ScalarValue::Null) => { - nullif(current_value.as_ref(), &when_value)? - } - ColumnarValue::Scalar(then_value) => { - zip(&when_value, &then_value.to_scalar()?, ¤t_value)? - } - ColumnarValue::Array(then_value) => { - zip(&when_value, &then_value, ¤t_value)? - } - }; + // If this is the last 'when' branch and there is no 'else' expression, there's no + // point in calculating the remaining rows. + if self.else_expr.is_none() && i == self.when_then_expr.len() - 1 { + return result_builder.finish(); + } - // Succeed tuples should be filtered out for short-circuit evaluation, - // null values for the current when expr should be kept - remainder = and_not(&remainder, &when_value)?; - remainder_count -= when_match_count; + // Prepare the next when branch (or the else branch) + let next_selection = match when_value.null_count() { + 0 => not(when_value), + _ => { + // `prep_null_mask_filter` is required to ensure the not operation treats nulls + // as false + not(&prep_null_mask_filter(when_value)) + } + }?; + let next_filter = create_filter(&next_selection); + remainder_batch = + Cow::Owned(filter_record_batch(&remainder_batch, &next_filter)?); + remainder_rows = filter_array(&remainder_rows, &next_filter)?; } + // If we reached this point, some rows were left unmatched. + // Check if those need to be evaluated using the 'else' expression. if let Some(e) = self.else_expr() { - if remainder_count > 0 { - // keep `else_expr`'s data type and return type consistent - let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; - let else_ = expr - .evaluate_selection(batch, &remainder)? 
- .into_array(batch.num_rows())?; - current_value = zip(&remainder, &else_, ¤t_value)?; - } + // keep `else_expr`'s data type and return type consistent + let expr = try_cast(Arc::clone(e), &batch.schema(), return_type.clone())?; + let else_value = expr.evaluate(&remainder_batch)?; + result_builder.add_branch_result(&remainder_rows, else_value)?; } - Ok(ColumnarValue::Array(current_value)) + result_builder.finish() } /// This function evaluates the specialized case of: @@ -587,7 +1083,7 @@ impl PhysicalExpr for CaseExpr { } } - fn fmt_sql(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt_sql(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "CASE ")?; if let Some(e) = &self.expr { e.fmt_sql(f)?; diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt index 352300e753a7..4eaa87b0b516 100644 --- a/datafusion/sqllogictest/test_files/case.slt +++ b/datafusion/sqllogictest/test_files/case.slt @@ -595,3 +595,25 @@ SELECT CASE WHEN a = 'a' THEN 0 WHEN a = 'b' THEN 1 ELSE 2 END FROM (VALUES (NUL ---- 2 2 + +# The `WHEN 1/0` is not effectively reachable in this query and should never be executed +query T +SELECT CASE a WHEN 1 THEN 'a' WHEN 2 THEN 'b' WHEN 1 / 0 THEN 'c' ELSE 'd' END FROM (VALUES (1), (2)) t(a) +---- +a +b + +# The `WHEN 1/0` is not effectively reachable in this query and should never be executed +query T +SELECT CASE WHEN a = 1 THEN 'a' WHEN a = 2 THEN 'b' WHEN a = 1 / 0 THEN 'c' ELSE 'd' END FROM (VALUES (1), (2)) t(a) +---- +a +b + +# The `WHEN 1/0` is not effectively reachable in this query and should never be executed +query T +SELECT CASE WHEN a = 0 THEN 'a' WHEN 1 / a = 1 THEN 'b' ELSE 'c' END FROM (VALUES (0), (1), (2)) t(a) +---- +a +b +c From fe68e75243507d823bda4e2d72c7b2a47dddb0f3 Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Wed, 29 Oct 2025 04:48:13 +0800 Subject: [PATCH 038/157] chore: Format examples in doc strings - execution (#18339) ## Which issue does this PR close? Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p datafusion-execution -- --config format_code_in_doc_comments=true` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. --- datafusion/execution/src/config.rs | 13 ++++++++----- datafusion/execution/src/memory_pool/pool.rs | 4 +++- datafusion/execution/src/runtime_env.rs | 6 +++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 491b1aca69ea..a0b180bf4020 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -44,12 +44,15 @@ use datafusion_common::{ /// shorthand for setting `datafusion.execution.batch_size`. 
/// /// ``` -/// use datafusion_execution::config::SessionConfig; /// use datafusion_common::ScalarValue; +/// use datafusion_execution::config::SessionConfig; /// /// let config = SessionConfig::new() -/// .set("datafusion.execution.batch_size", &ScalarValue::UInt64(Some(1234))) -/// .set_bool("datafusion.execution.parquet.pushdown_filters", true); +/// .set( +/// "datafusion.execution.batch_size", +/// &ScalarValue::UInt64(Some(1234)), +/// ) +/// .set_bool("datafusion.execution.parquet.pushdown_filters", true); /// /// assert_eq!(config.batch_size(), 1234); /// assert_eq!(config.options().execution.batch_size, 1234); @@ -502,8 +505,8 @@ impl SessionConfig { /// /// # Example /// ``` - /// use std::sync::Arc; /// use datafusion_execution::config::SessionConfig; + /// use std::sync::Arc; /// /// // application-specific extension types /// struct Ext1(u8); @@ -545,8 +548,8 @@ impl SessionConfig { /// /// # Example /// ``` - /// use std::sync::Arc; /// use datafusion_execution::config::SessionConfig; + /// use std::sync::Arc; /// /// // application-specific extension types /// struct Ext1(u8); diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index 306df3defdbb..d6b55182aa6b 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -346,8 +346,10 @@ impl TrackConsumersPool { /// # Example /// /// ```rust + /// use datafusion_execution::memory_pool::{ + /// FairSpillPool, GreedyMemoryPool, TrackConsumersPool, + /// }; /// use std::num::NonZeroUsize; - /// use datafusion_execution::memory_pool::{TrackConsumersPool, GreedyMemoryPool, FairSpillPool}; /// /// // Create with a greedy pool backend, reporting top 3 consumers in error messages /// let tracked_greedy = TrackConsumersPool::new( diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs index b0d0a966b7a2..d69987600855 100644 --- a/datafusion/execution/src/runtime_env.rs +++ b/datafusion/execution/src/runtime_env.rs @@ -67,9 +67,9 @@ use url::Url; /// // restrict to using at most 100MB of memory /// let pool_size = 100 * 1024 * 1024; /// let runtime_env = RuntimeEnvBuilder::new() -/// .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) -/// .build() -/// .unwrap(); +/// .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) +/// .build() +/// .unwrap(); /// ``` pub struct RuntimeEnv { /// Runtime memory management From 469e9eca7f273f97e946ba5950f90a45f541aa73 Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Wed, 29 Oct 2025 04:50:26 +0800 Subject: [PATCH 039/157] chore: Format examples in doc strings - common (#18336) ## Which issue does this PR close? Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p datafusion-common -- --config format_code_in_doc_comments=true` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. 
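For reference, a minimal sketch of the style this pass produces (illustrative only and not part of the diff; the crate and function names below are placeholders): with `format_code_in_doc_comments` enabled, code inside doc-comment fences is laid out the same way rustfmt lays out ordinary source, so examples stay readable without manual alignment.

````rust
/// Adds two numbers.
///
/// ```
/// // Hypothetical example: `my_crate` and `add` stand in for any public item.
/// // Before formatting this might read `let sum=add( 1,2 );` on a cramped line;
/// // after formatting it reads like normal Rust source:
/// let sum = my_crate::add(1, 2);
/// assert_eq!(sum, 3);
/// ```
pub fn add(a: i32, b: i32) -> i32 {
    a + b
}
````

Note that `format_code_in_doc_comments` is an unstable rustfmt option at the time of writing, so the `cargo fmt` command above is typically run with a nightly toolchain.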
--- datafusion/common/src/config.rs | 47 +++--- datafusion/common/src/datatype.rs | 2 - datafusion/common/src/dfschema.rs | 40 ++--- datafusion/common/src/diagnostic.rs | 7 +- datafusion/common/src/error.rs | 17 +- datafusion/common/src/metadata.rs | 1 - datafusion/common/src/nested_struct.rs | 9 +- datafusion/common/src/scalar/mod.rs | 153 +++++++++--------- .../common/src/scalar/struct_builder.rs | 12 +- datafusion/common/src/stats.rs | 46 +++--- datafusion/common/src/table_reference.rs | 12 +- datafusion/common/src/test_util.rs | 2 +- datafusion/common/src/tree_node.rs | 13 +- datafusion/common/src/types/logical.rs | 12 +- datafusion/common/src/utils/memory.rs | 8 +- datafusion/common/src/utils/mod.rs | 43 ++--- datafusion/common/src/utils/proxy.rs | 16 +- 17 files changed, 232 insertions(+), 208 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 10199db1a1de..bc321b227ee5 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -57,7 +57,7 @@ use std::sync::Arc; /// /// Field 3 doc /// field3: Option, default = None /// } -///} +/// } /// ``` /// /// Will generate @@ -1326,36 +1326,35 @@ impl ConfigOptions { /// # Example /// ``` /// use datafusion_common::{ -/// config::ConfigExtension, extensions_options, -/// config::ConfigOptions, +/// config::ConfigExtension, config::ConfigOptions, extensions_options, /// }; -/// // Define a new configuration struct using the `extensions_options` macro -/// extensions_options! { -/// /// My own config options. -/// pub struct MyConfig { -/// /// Should "foo" be replaced by "bar"? -/// pub foo_to_bar: bool, default = true +/// // Define a new configuration struct using the `extensions_options` macro +/// extensions_options! { +/// /// My own config options. +/// pub struct MyConfig { +/// /// Should "foo" be replaced by "bar"? +/// pub foo_to_bar: bool, default = true /// -/// /// How many "baz" should be created? -/// pub baz_count: usize, default = 1337 -/// } -/// } +/// /// How many "baz" should be created? 
+/// pub baz_count: usize, default = 1337 +/// } +/// } /// -/// impl ConfigExtension for MyConfig { +/// impl ConfigExtension for MyConfig { /// const PREFIX: &'static str = "my_config"; -/// } +/// } /// -/// // set up config struct and register extension -/// let mut config = ConfigOptions::default(); -/// config.extensions.insert(MyConfig::default()); +/// // set up config struct and register extension +/// let mut config = ConfigOptions::default(); +/// config.extensions.insert(MyConfig::default()); /// -/// // overwrite config default -/// config.set("my_config.baz_count", "42").unwrap(); +/// // overwrite config default +/// config.set("my_config.baz_count", "42").unwrap(); /// -/// // check config state -/// let my_config = config.extensions.get::().unwrap(); -/// assert!(my_config.foo_to_bar,); -/// assert_eq!(my_config.baz_count, 42,); +/// // check config state +/// let my_config = config.extensions.get::().unwrap(); +/// assert!(my_config.foo_to_bar,); +/// assert_eq!(my_config.baz_count, 42,); /// ``` /// /// # Note: diff --git a/datafusion/common/src/datatype.rs b/datafusion/common/src/datatype.rs index 544ec0c2468c..65f639521186 100644 --- a/datafusion/common/src/datatype.rs +++ b/datafusion/common/src/datatype.rs @@ -81,7 +81,6 @@ pub trait FieldExt { /// assert_eq!(list_field.data_type(), &DataType::List(Arc::new( /// Field::new("item", DataType::Int32, true) /// ))); - /// fn into_list(self) -> Self; /// Return a new Field representing this Field as the item type of a @@ -107,7 +106,6 @@ pub trait FieldExt { /// Field::new("item", DataType::Int32, true)), /// 3 /// )); - /// fn into_fixed_size_list(self, list_size: i32) -> Self; /// Update the field to have the default list field name ("item") diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 34a36f543657..24d152a7dba8 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -56,12 +56,10 @@ pub type DFSchemaRef = Arc; /// an Arrow schema. 
/// /// ```rust -/// use datafusion_common::{DFSchema, Column}; /// use arrow::datatypes::{DataType, Field, Schema}; +/// use datafusion_common::{Column, DFSchema}; /// -/// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// ]); +/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); /// /// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap(); /// let column = Column::from_qualified_name("t1.c1"); @@ -77,12 +75,10 @@ pub type DFSchemaRef = Arc; /// Create an unqualified schema using TryFrom: /// /// ```rust -/// use datafusion_common::{DFSchema, Column}; /// use arrow::datatypes::{DataType, Field, Schema}; +/// use datafusion_common::{Column, DFSchema}; /// -/// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// ]); +/// let arrow_schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); /// /// let df_schema = DFSchema::try_from(arrow_schema).unwrap(); /// let column = Column::new_unqualified("c1"); @@ -94,13 +90,15 @@ pub type DFSchemaRef = Arc; /// Use the `Into` trait to convert `DFSchema` into an Arrow schema: /// /// ```rust +/// use arrow::datatypes::{Field, Schema}; /// use datafusion_common::DFSchema; -/// use arrow::datatypes::{Schema, Field}; /// use std::collections::HashMap; /// -/// let df_schema = DFSchema::from_unqualified_fields(vec![ -/// Field::new("c1", arrow::datatypes::DataType::Int32, false), -/// ].into(),HashMap::new()).unwrap(); +/// let df_schema = DFSchema::from_unqualified_fields( +/// vec![Field::new("c1", arrow::datatypes::DataType::Int32, false)].into(), +/// HashMap::new(), +/// ) +/// .unwrap(); /// let schema: &Schema = df_schema.as_arrow(); /// assert_eq!(schema.fields().len(), 1); /// ``` @@ -884,22 +882,26 @@ impl DFSchema { /// # Example /// /// ``` - /// use datafusion_common::DFSchema; /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_common::DFSchema; /// use std::collections::HashMap; /// /// let schema = DFSchema::from_unqualified_fields( /// vec![ /// Field::new("id", DataType::Int32, false), /// Field::new("name", DataType::Utf8, true), - /// ].into(), - /// HashMap::new() - /// ).unwrap(); + /// ] + /// .into(), + /// HashMap::new(), + /// ) + /// .unwrap(); /// - /// assert_eq!(schema.tree_string().to_string(), - /// r#"root + /// assert_eq!( + /// schema.tree_string().to_string(), + /// r#"root /// |-- id: int32 (nullable = false) - /// |-- name: utf8 (nullable = true)"#); + /// |-- name: utf8 (nullable = true)"# + /// ); /// ``` pub fn tree_string(&self) -> impl Display + '_ { let mut result = String::from("root\n"); diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 0dce8e6a56ec..b25bf1c12e44 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -30,8 +30,11 @@ use crate::Span; /// ```rust /// # use datafusion_common::{Location, Span, Diagnostic}; /// let span = Some(Span { -/// start: Location{ line: 2, column: 1 }, -/// end: Location{ line: 4, column: 15 } +/// start: Location { line: 2, column: 1 }, +/// end: Location { +/// line: 4, +/// column: 15, +/// }, /// }); /// let diagnostic = Diagnostic::new_error("Something went wrong", span) /// .with_help("Have you tried turning it on and off again?", None); diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 210f0442972d..fde52944d049 100644 --- a/datafusion/common/src/error.rs +++ 
b/datafusion/common/src/error.rs @@ -684,7 +684,10 @@ impl DataFusionError { /// let mut builder = DataFusionError::builder(); /// builder.add_error(DataFusionError::Internal("foo".to_owned())); /// // ok_or returns the value if no errors have been added -/// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); +/// assert_contains!( +/// builder.error_or(42).unwrap_err().to_string(), +/// "Internal error: foo" +/// ); /// ``` #[derive(Debug, Default)] pub struct DataFusionErrorBuilder(Vec); @@ -702,7 +705,10 @@ impl DataFusionErrorBuilder { /// # use datafusion_common::{assert_contains, DataFusionError}; /// let mut builder = DataFusionError::builder(); /// builder.add_error(DataFusionError::Internal("foo".to_owned())); - /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); + /// assert_contains!( + /// builder.error_or(42).unwrap_err().to_string(), + /// "Internal error: foo" + /// ); /// ``` pub fn add_error(&mut self, error: DataFusionError) { self.0.push(error); @@ -714,8 +720,11 @@ impl DataFusionErrorBuilder { /// ``` /// # use datafusion_common::{assert_contains, DataFusionError}; /// let builder = DataFusionError::builder() - /// .with_error(DataFusionError::Internal("foo".to_owned())); - /// assert_contains!(builder.error_or(42).unwrap_err().to_string(), "Internal error: foo"); + /// .with_error(DataFusionError::Internal("foo".to_owned())); + /// assert_contains!( + /// builder.error_or(42).unwrap_err().to_string(), + /// "Internal error: foo" + /// ); /// ``` pub fn with_error(mut self, error: DataFusionError) -> Self { self.0.push(error); diff --git a/datafusion/common/src/metadata.rs b/datafusion/common/src/metadata.rs index 39065808efb9..3a10cc2b42f9 100644 --- a/datafusion/common/src/metadata.rs +++ b/datafusion/common/src/metadata.rs @@ -171,7 +171,6 @@ pub fn format_type_and_metadata( /// // Add any metadata from `FieldMetadata` to `Field` /// let updated_field = metadata.add_to_field(field); /// ``` -/// #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] pub struct FieldMetadata { /// The inner metadata of a literal expression, which is a map of string diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 38060e370bfa..d43816f75b0e 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -110,16 +110,19 @@ fn cast_struct_column( /// temporal values are formatted when cast to strings. 
/// /// ``` -/// use std::sync::Arc; -/// use arrow::array::{Int64Array, ArrayRef}; +/// use arrow::array::{ArrayRef, Int64Array}; /// use arrow::compute::CastOptions; /// use arrow::datatypes::{DataType, Field}; /// use datafusion_common::nested_struct::cast_column; +/// use std::sync::Arc; /// /// let source: ArrayRef = Arc::new(Int64Array::from(vec![1, i64::MAX])); /// let target = Field::new("ints", DataType::Int32, true); /// // Permit lossy conversions by producing NULL on overflow instead of erroring -/// let options = CastOptions { safe: true, ..Default::default() }; +/// let options = CastOptions { +/// safe: true, +/// ..Default::default() +/// }; /// let result = cast_column(&source, &target, &options).unwrap(); /// assert!(result.is_null(1)); /// ``` diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index a70a027a8fac..f2546040ffd7 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -171,9 +171,9 @@ pub use struct_builder::ScalarStructBuilder; /// let field_b = Field::new("b", DataType::Utf8, false); /// /// let s1 = ScalarStructBuilder::new() -/// .with_scalar(field_a, ScalarValue::from(1i32)) -/// .with_scalar(field_b, ScalarValue::from("foo")) -/// .build(); +/// .with_scalar(field_a, ScalarValue::from(1i32)) +/// .with_scalar(field_b, ScalarValue::from("foo")) +/// .build(); /// ``` /// /// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`] @@ -199,13 +199,13 @@ pub use struct_builder::ScalarStructBuilder; /// // Build a struct like: {a: 1, b: "foo"} /// // Field description /// let fields = Fields::from(vec![ -/// Field::new("a", DataType::Int32, false), -/// Field::new("b", DataType::Utf8, false), +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Utf8, false), /// ]); /// // one row arrays for each field /// let arrays: Vec = vec![ -/// Arc::new(Int32Array::from(vec![1])), -/// Arc::new(StringArray::from(vec!["foo"])), +/// Arc::new(Int32Array::from(vec![1])), +/// Arc::new(StringArray::from(vec!["foo"])), /// ]; /// // no nulls for this array /// let nulls = None; @@ -1068,8 +1068,8 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::datatypes::DataType; + /// use datafusion_common::ScalarValue; /// /// let scalar = ScalarValue::try_new_null(&DataType::Int32).unwrap(); /// assert_eq!(scalar.is_null(), true); @@ -2231,23 +2231,16 @@ impl ScalarValue { /// /// # Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::{BooleanArray, Int32Array}; + /// use datafusion_common::ScalarValue; /// /// let arr = Int32Array::from(vec![Some(1), None, Some(10)]); /// let five = ScalarValue::Int32(Some(5)); /// - /// let result = arrow::compute::kernels::cmp::lt( - /// &arr, - /// &five.to_scalar().unwrap(), - /// ).unwrap(); + /// let result = + /// arrow::compute::kernels::cmp::lt(&arr, &five.to_scalar().unwrap()).unwrap(); /// - /// let expected = BooleanArray::from(vec![ - /// Some(true), - /// None, - /// Some(false) - /// ] - /// ); + /// let expected = BooleanArray::from(vec![Some(true), None, Some(false)]); /// /// assert_eq!(&result, &expected); /// ``` @@ -2265,26 +2258,20 @@ impl ScalarValue { /// /// # Example /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::{ArrayRef, BooleanArray}; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Boolean(Some(true)), - /// ScalarValue::Boolean(None), - /// 
ScalarValue::Boolean(Some(false)), + /// ScalarValue::Boolean(Some(true)), + /// ScalarValue::Boolean(None), + /// ScalarValue::Boolean(Some(false)), /// ]; /// /// // Build an Array from the list of ScalarValues - /// let array = ScalarValue::iter_to_array(scalars.into_iter()) - /// .unwrap(); + /// let array = ScalarValue::iter_to_array(scalars.into_iter()).unwrap(); /// - /// let expected: ArrayRef = std::sync::Arc::new( - /// BooleanArray::from(vec![ - /// Some(true), - /// None, - /// Some(false) - /// ] - /// )); + /// let expected: ArrayRef = + /// std::sync::Arc::new(BooleanArray::from(vec![Some(true), None, Some(false)])); /// /// assert_eq!(&array, &expected); /// ``` @@ -2731,23 +2718,24 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{ListArray, Int32Array}; + /// use arrow::array::{Int32Array, ListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// /// let result = ScalarValue::new_list(&scalars, &DataType::Int32, true); /// - /// let expected = ListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -2791,23 +2779,25 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{ListArray, Int32Array}; + /// use arrow::array::{Int32Array, ListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// - /// let result = ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true); + /// let result = + /// ScalarValue::new_list_from_iter(scalars.into_iter(), &DataType::Int32, true); /// - /// let expected = ListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -2833,23 +2823,25 @@ impl ScalarValue { /// /// Example /// ``` - /// use datafusion_common::ScalarValue; - /// use arrow::array::{LargeListArray, Int32Array}; + /// use arrow::array::{Int32Array, LargeListArray}; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::cast::as_large_list_array; + /// use datafusion_common::ScalarValue; /// /// let scalars = vec![ - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(None), - /// ScalarValue::Int32(Some(2)) + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(None), + /// ScalarValue::Int32(Some(2)), /// ]; /// /// let result = ScalarValue::new_large_list(&scalars, &DataType::Int32); /// - /// let expected = LargeListArray::from_iter_primitive::( - /// vec![ - /// Some(vec![Some(1), None, Some(2)]) - /// ]); + /// let expected = + /// 
LargeListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// None, + /// Some(2), + /// ])]); /// /// assert_eq!(*result, expected); /// ``` @@ -3248,14 +3240,14 @@ impl ScalarValue { /// /// Example 1: Array (ScalarValue::Int32) /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; + /// use datafusion_common::ScalarValue; /// /// // Equivalent to [[1,2,3], [4,5]] /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Convert the array into Scalar Values for each row @@ -3278,15 +3270,15 @@ impl ScalarValue { /// /// Example 2: Nested array (ScalarValue::List) /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; /// use datafusion_common::utils::SingleRowListArrayBuilder; + /// use datafusion_common::ScalarValue; /// use std::sync::Arc; /// /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Wrap into another layer of list, we got nested array as [ [[1,2,3], [4,5]] ] @@ -3295,33 +3287,34 @@ impl ScalarValue { /// // Convert the array into Scalar Values for each row, we got 1D arrays in this example /// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap(); /// - /// let l1 = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// ]); - /// let l2 = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(4), Some(5)]), - /// ]); + /// let l1 = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(1), + /// Some(2), + /// Some(3), + /// ])]); + /// let l2 = ListArray::from_iter_primitive::(vec![Some(vec![ + /// Some(4), + /// Some(5), + /// ])]); /// - /// let expected = vec![ - /// Some(vec![ + /// let expected = vec![Some(vec![ /// ScalarValue::List(Arc::new(l1)), /// ScalarValue::List(Arc::new(l2)), - /// ]), - /// ]; + /// ])]; /// /// assert_eq!(scalar_vec, expected); /// ``` /// /// Example 3: Nullable array /// ``` - /// use datafusion_common::ScalarValue; /// use arrow::array::ListArray; /// use arrow::datatypes::{DataType, Int32Type}; + /// use datafusion_common::ScalarValue; /// /// let list_arr = ListArray::from_iter_primitive::(vec![ - /// Some(vec![Some(1), Some(2), Some(3)]), - /// None, - /// Some(vec![Some(4), Some(5)]) + /// Some(vec![Some(1), Some(2), Some(3)]), + /// None, + /// Some(vec![Some(4), Some(5)]), /// ]); /// /// // Convert the array into Scalar Values for each row diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index fd19dccf8963..56daee904514 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -47,13 +47,11 @@ impl ScalarStructBuilder { /// ```rust /// # use arrow::datatypes::{DataType, Field}; /// # use datafusion_common::scalar::ScalarStructBuilder; - /// let fields = vec![ - /// Field::new("a", DataType::Int32, false), - /// ]; + /// let fields = vec![Field::new("a", DataType::Int32, false)]; /// let sv = ScalarStructBuilder::new_null(fields); /// // Note this is `NULL`, not `{a: NULL}` /// assert_eq!(format!("{sv}"), "NULL"); - ///``` + 
/// ``` /// /// To create a struct where the *fields* are null, use `Self::new()` and /// pass null values for each field: @@ -65,9 +63,9 @@ impl ScalarStructBuilder { /// let field = Field::new("a", DataType::Int32, true); /// // add a null value for the "a" field /// let sv = ScalarStructBuilder::new() - /// .with_scalar(field, ScalarValue::Int32(None)) - /// .build() - /// .unwrap(); + /// .with_scalar(field, ScalarValue::Int32(None)) + /// .build() + /// .unwrap(); /// // value is not null, but field is /// assert_eq!(format!("{sv}"), "{a:}"); /// ``` diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 2481a88676ef..da298c20ebcb 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -520,33 +520,35 @@ impl Statistics { /// # use arrow::datatypes::{Field, Schema, DataType}; /// # use datafusion_common::stats::Precision; /// let stats1 = Statistics::default() - /// .with_num_rows(Precision::Exact(1)) - /// .with_total_byte_size(Precision::Exact(2)) - /// .add_column_statistics(ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Exact(3)) - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(5))) - /// ); + /// .with_num_rows(Precision::Exact(1)) + /// .with_total_byte_size(Precision::Exact(2)) + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// .with_null_count(Precision::Exact(3)) + /// .with_min_value(Precision::Exact(ScalarValue::from(4))) + /// .with_max_value(Precision::Exact(ScalarValue::from(5))), + /// ); /// /// let stats2 = Statistics::default() - /// .with_num_rows(Precision::Exact(10)) - /// .with_total_byte_size(Precision::Inexact(20)) - /// .add_column_statistics(ColumnStatistics::new_unknown() - /// // absent null count - /// .with_min_value(Precision::Exact(ScalarValue::from(40))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))) - /// ); + /// .with_num_rows(Precision::Exact(10)) + /// .with_total_byte_size(Precision::Inexact(20)) + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// // absent null count + /// .with_min_value(Precision::Exact(ScalarValue::from(40))) + /// .with_max_value(Precision::Exact(ScalarValue::from(50))), + /// ); /// /// let merged_stats = stats1.try_merge(&stats2).unwrap(); /// let expected_stats = Statistics::default() - /// .with_num_rows(Precision::Exact(11)) - /// .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact - /// .add_column_statistics( - /// ColumnStatistics::new_unknown() - /// .with_null_count(Precision::Absent) // missing from stats2 --> absent - /// .with_min_value(Precision::Exact(ScalarValue::from(4))) - /// .with_max_value(Precision::Exact(ScalarValue::from(50))) - /// ); + /// .with_num_rows(Precision::Exact(11)) + /// .with_total_byte_size(Precision::Inexact(22)) // inexact in stats2 --> inexact + /// .add_column_statistics( + /// ColumnStatistics::new_unknown() + /// .with_null_count(Precision::Absent) // missing from stats2 --> absent + /// .with_min_value(Precision::Exact(ScalarValue::from(4))) + /// .with_max_value(Precision::Exact(ScalarValue::from(50))), + /// ); /// /// assert_eq!(merged_stats, expected_stats) /// ``` diff --git a/datafusion/common/src/table_reference.rs b/datafusion/common/src/table_reference.rs index 574465856760..3163a8b16c8d 100644 --- a/datafusion/common/src/table_reference.rs +++ b/datafusion/common/src/table_reference.rs @@ -69,8 +69,11 @@ impl std::fmt::Display for 
ResolvedTableReference { /// /// // Get a table reference to 'myschema.mytable' (note the capitalization) /// let table_reference = TableReference::from("MySchema.MyTable"); -/// assert_eq!(table_reference, TableReference::partial("myschema", "mytable")); -///``` +/// assert_eq!( +/// table_reference, +/// TableReference::partial("myschema", "mytable") +/// ); +/// ``` #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum TableReference { /// An unqualified table reference, e.g. "table" @@ -247,7 +250,10 @@ impl TableReference { /// assert_eq!(table_reference.to_quoted_string(), "myschema.mytable"); /// /// let table_reference = TableReference::partial("MySchema", "MyTable"); - /// assert_eq!(table_reference.to_quoted_string(), r#""MySchema"."MyTable""#); + /// assert_eq!( + /// table_reference.to_quoted_string(), + /// r#""MySchema"."MyTable""# + /// ); /// ``` pub fn to_quoted_string(&self) -> String { match self { diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index d97d4003e729..c51dea1c4de0 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -55,7 +55,7 @@ pub fn format_batches(results: &[RecordBatch]) -> Result i64 { 2 } /// let expr = orig_expr(); /// let ret = Transformed::no(expr.clone()) -/// .transform_data(|expr| { -/// // closure returns a result and potentially transforms the node -/// // in this example, it does transform the node -/// let new_expr = make_new_expr(expr); -/// Ok(Transformed::yes(new_expr)) -/// }).unwrap(); +/// .transform_data(|expr| { +/// // closure returns a result and potentially transforms the node +/// // in this example, it does transform the node +/// let new_expr = make_new_expr(expr); +/// Ok(Transformed::yes(new_expr)) +/// }) +/// .unwrap(); /// // transformed flag is the union of the original ans closure's transformed flag /// assert!(ret.transformed); /// ``` diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs index eb7cf88e0075..674b1a41204d 100644 --- a/datafusion/common/src/types/logical.rs +++ b/datafusion/common/src/types/logical.rs @@ -67,12 +67,12 @@ pub type LogicalTypeRef = Arc; /// &NativeType::String /// } /// -/// fn signature(&self) -> TypeSignature<'_> { -/// TypeSignature::Extension { -/// name: "JSON", -/// parameters: &[], -/// } -/// } +/// fn signature(&self) -> TypeSignature<'_> { +/// TypeSignature::Extension { +/// name: "JSON", +/// parameters: &[], +/// } +/// } /// } /// ``` pub trait LogicalType: Sync + Send { diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs index 29e523996cf4..a56b940fab66 100644 --- a/datafusion/common/src/utils/memory.rs +++ b/datafusion/common/src/utils/memory.rs @@ -56,8 +56,8 @@ use std::mem::size_of; /// impl MyStruct { /// fn size(&self) -> Result { /// let num_elements = self.values.len(); -/// let fixed_size = std::mem::size_of_val(self) + -/// std::mem::size_of_val(&self.values); +/// let fixed_size = +/// std::mem::size_of_val(self) + std::mem::size_of_val(&self.values); /// /// estimate_memory_size::(num_elements, fixed_size) /// } @@ -73,8 +73,8 @@ use std::mem::size_of; /// let num_rows = 100; /// let fixed_size = std::mem::size_of::>(); /// let estimated_hashtable_size = -/// estimate_memory_size::<(u64, u64)>(num_rows,fixed_size) -/// .expect("Size estimation failed"); +/// estimate_memory_size::<(u64, u64)>(num_rows, fixed_size) +/// .expect("Size estimation failed"); /// ``` pub fn 
estimate_memory_size(num_elements: usize, fixed_size: usize) -> Result { // For the majority of cases hashbrown overestimates the bucket quantity diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 045c02a5a2aa..7b145ac3ae21 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -46,26 +46,23 @@ use std::thread::available_parallelism; /// /// Example: /// ``` -/// use arrow::datatypes::{SchemaRef, Schema, Field, DataType}; +/// use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; /// use datafusion_common::project_schema; /// /// // Schema with columns 'a', 'b', and 'c' /// let schema = SchemaRef::new(Schema::new(vec![ -/// Field::new("a", DataType::Int32, true), -/// Field::new("b", DataType::Int64, true), -/// Field::new("c", DataType::Utf8, true), +/// Field::new("a", DataType::Int32, true), +/// Field::new("b", DataType::Int64, true), +/// Field::new("c", DataType::Utf8, true), /// ])); /// /// // Pick columns 'c' and 'b' -/// let projection = Some(vec![2,1]); -/// let projected_schema = project_schema( -/// &schema, -/// projection.as_ref() -/// ).unwrap(); +/// let projection = Some(vec![2, 1]); +/// let projected_schema = project_schema(&schema, projection.as_ref()).unwrap(); /// /// let expected_schema = SchemaRef::new(Schema::new(vec![ -/// Field::new("c", DataType::Utf8, true), -/// Field::new("b", DataType::Int64, true), +/// Field::new("c", DataType::Utf8, true), +/// Field::new("b", DataType::Int64, true), /// ])); /// /// assert_eq!(projected_schema, expected_schema); @@ -398,9 +395,11 @@ pub fn longest_consecutive_prefix>( /// # use arrow::array::types::Int64Type; /// # use datafusion_common::utils::SingleRowListArrayBuilder; /// // Array is [1, 2, 3] -/// let arr = ListArray::from_iter_primitive::(vec![ -/// Some(vec![Some(1), Some(2), Some(3)]), -/// ]); +/// let arr = ListArray::from_iter_primitive::(vec![Some(vec![ +/// Some(1), +/// Some(2), +/// Some(3), +/// ])]); /// // Wrap as a list array: [[1, 2, 3]] /// let list_arr = SingleRowListArrayBuilder::new(Arc::new(arr)).build_list_array(); /// assert_eq!(list_arr.len(), 1); @@ -554,7 +553,8 @@ pub fn fixed_size_list_to_arrays(a: &ArrayRef) -> Vec { /// use datafusion_common::utils::base_type; /// use std::sync::Arc; /// -/// let data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); +/// let data_type = +/// DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); /// assert_eq!(base_type(&data_type), DataType::Int32); /// /// let data_type = DataType::Int32; @@ -906,16 +906,19 @@ pub fn get_available_parallelism() -> usize { /// # use datafusion_common::utils::take_function_args; /// # use datafusion_common::ScalarValue; /// fn my_function(args: &[ScalarValue]) -> Result<()> { -/// // function expects 2 args, so create a 2-element array -/// let [arg1, arg2] = take_function_args("my_function", args)?; -/// // ... do stuff.. -/// Ok(()) +/// // function expects 2 args, so create a 2-element array +/// let [arg1, arg2] = take_function_args("my_function", args)?; +/// // ... do stuff.. 
+/// Ok(()) /// } /// /// // Calling the function with 1 argument produces an error: /// let args = vec![ScalarValue::Int32(Some(10))]; /// let err = my_function(&args).unwrap_err(); -/// assert_eq!(err.to_string(), "Execution error: my_function function requires 2 arguments, got 1"); +/// assert_eq!( +/// err.to_string(), +/// "Execution error: my_function function requires 2 arguments, got 1" +/// ); /// // Calling the function with 2 arguments works great /// let args = vec![ScalarValue::Int32(Some(10)), ScalarValue::Int32(Some(20))]; /// my_function(&args).unwrap(); diff --git a/datafusion/common/src/utils/proxy.rs b/datafusion/common/src/utils/proxy.rs index d940677a5fb3..fb951aa3b028 100644 --- a/datafusion/common/src/utils/proxy.rs +++ b/datafusion/common/src/utils/proxy.rs @@ -47,7 +47,9 @@ pub trait VecAllocExt { /// assert_eq!(allocated, 16); // no new allocation needed /// /// // push more data into the vec - /// for _ in 0..10 { vec.push_accounted(1, &mut allocated); } + /// for _ in 0..10 { + /// vec.push_accounted(1, &mut allocated); + /// } /// assert_eq!(allocated, 64); // underlying vec has space for 10 u32s /// assert_eq!(vec.allocated_size(), 64); /// ``` @@ -82,7 +84,9 @@ pub trait VecAllocExt { /// assert_eq!(vec.allocated_size(), 16); // no new allocation needed /// /// // push more data into the vec - /// for _ in 0..10 { vec.push(1); } + /// for _ in 0..10 { + /// vec.push(1); + /// } /// assert_eq!(vec.allocated_size(), 64); // space for 64 now /// ``` fn allocated_size(&self) -> usize; @@ -133,7 +137,9 @@ pub trait RawTableAllocExt { /// assert_eq!(allocated, 64); /// /// // insert more values - /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); } + /// for i in 0..100 { + /// table.insert_accounted(i, hash_fn, &mut allocated); + /// } /// assert_eq!(allocated, 400); /// ``` fn insert_accounted( @@ -200,7 +206,9 @@ pub trait HashTableAllocExt { /// assert_eq!(allocated, 64); /// /// // insert more values - /// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); } + /// for i in 0..100 { + /// table.insert_accounted(i, hash_fn, &mut allocated); + /// } /// assert_eq!(allocated, 400); /// ``` fn insert_accounted( From d8d8ccc25266069bea0cbd2dea77159ddcafecd6 Mon Sep 17 00:00:00 2001 From: Emily Matheys <55631053+EmilyMatt@users.noreply.github.com> Date: Tue, 28 Oct 2025 23:59:07 +0200 Subject: [PATCH 040/157] feat: Improve metrics for aggregate streams. (#18325) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #18323 . ## Rationale for this change Adds more detailed metrics, so it is easier to identify which part of the aggregate streams are actually slow. ## What changes are included in this PR? Added a metrics struct, and used it in the functions common to the aggregate streams. ## Are these changes tested? Yes, added some tests to verify the metrics are actually updated and can be retrieved. I've also ran the groupby benchmarks to ensure we don't create timers in a way that could impact performance, and it seems ok, all the changes are within what I'd expect as std variation on a local machine. 
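For illustration only (not part of this patch): a minimal sketch of how the new timers could be read back after a plan has run. It assumes an `AggregateExec` and `TaskContext` built the same way as in the tests added below, and relies only on existing APIs (`ExecutionPlan::metrics`, `MetricsSet::sum_by_name`, `collect`) plus the metric names introduced by this change.

```rust
// Hypothetical helper, mirroring the test setup in this PR; it is a sketch,
// not code added by the patch.
use std::sync::Arc;

use datafusion_common::Result;
use datafusion_execution::TaskContext;
use datafusion_physical_plan::aggregates::AggregateExec;
use datafusion_physical_plan::{collect, ExecutionPlan};

async fn report_group_by_timers(
    aggregate_exec: Arc<AggregateExec>,
    task_ctx: Arc<TaskContext>,
) -> Result<()> {
    // Run the plan so the per-partition timers are populated.
    let _batches = collect(Arc::clone(&aggregate_exec) as _, task_ctx).await?;

    // Metrics are summed across partitions; `sum_by_name` returns `None`
    // if a metric was never registered.
    let metrics = aggregate_exec.metrics().expect("metrics are recorded");
    for name in [
        "time_calculating_group_ids",
        "aggregate_arguments_time",
        "aggregation_time",
        "emitting_time",
    ] {
        if let Some(value) = metrics.sum_by_name(name) {
            // Timers report elapsed nanoseconds via `as_usize`.
            println!("{name}: {} ns", value.as_usize());
        }
    }
    Ok(())
}
```

The metric names above are the ones registered by the new `GroupByMetrics::new`, so the same loop works for both the hash and the top-K aggregate streams. The benchmark comparison referenced above: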
``` Comparing main and agg-metrics -------------------- Benchmark h2o.json -------------------- ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Query ┃ main ┃ agg-metrics ┃ Change ┃ ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ QQuery 1 │ 1252.42 ms │ 1196.62 ms │ no change │ │ QQuery 2 │ 3976.62 ms │ 3392.89 ms │ +1.17x faster │ │ QQuery 3 │ 3448.29 ms │ 2918.47 ms │ +1.18x faster │ │ QQuery 4 │ 1909.15 ms │ 1632.98 ms │ +1.17x faster │ │ QQuery 5 │ 3056.36 ms │ 2831.82 ms │ +1.08x faster │ │ QQuery 6 │ 2663.13 ms │ 2594.64 ms │ no change │ │ QQuery 7 │ 2802.28 ms │ 2592.43 ms │ +1.08x faster │ │ QQuery 8 │ 4489.29 ms │ 4199.00 ms │ +1.07x faster │ │ QQuery 9 │ 7001.75 ms │ 6622.98 ms │ +1.06x faster │ │ QQuery 10 │ 4725.80 ms │ 4619.37 ms │ no change │ └──────────────┴────────────┴─────────────┴───────────────┘ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓ ┃ Benchmark Summary ┃ ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩ │ Total Time (main) │ 35325.09ms │ │ Total Time (agg-metrics) │ 32601.19ms │ │ Average Time (main) │ 3532.51ms │ │ Average Time (agg-metrics) │ 3260.12ms │ │ Queries Faster │ 7 │ │ Queries Slower │ 0 │ │ Queries with No Change │ 3 │ │ Queries with Failure │ 0 │ └────────────────────────────┴────────────┘ ``` ## Are there any user-facing changes? Nothing that is direct to the user, additional metrics will now be available, but no breaking changes. --------- Co-authored-by: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Co-authored-by: Eshed Schacham --- .../src/aggregates/group_values/metrics.rs | 214 ++++++++++++++++++ .../src/aggregates/group_values/mod.rs | 3 + .../physical-plan/src/aggregates/row_hash.rs | 35 ++- .../src/aggregates/topk_stream.rs | 33 ++- 4 files changed, 278 insertions(+), 7 deletions(-) create mode 100644 datafusion/physical-plan/src/aggregates/group_values/metrics.rs diff --git a/datafusion/physical-plan/src/aggregates/group_values/metrics.rs b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs new file mode 100644 index 000000000000..c4e29ea71060 --- /dev/null +++ b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Metrics for the various group-by implementations. + +use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time}; + +pub(crate) struct GroupByMetrics { + /// Time spent calculating the group IDs from the evaluated grouping columns. + pub(crate) time_calculating_group_ids: Time, + /// Time spent evaluating the inputs to the aggregate functions. + pub(crate) aggregate_arguments_time: Time, + /// Time spent evaluating the aggregate expressions themselves + /// (e.g. summing all elements and counting number of elements for `avg` aggregate). 
+ pub(crate) aggregation_time: Time, + /// Time spent emitting the final results and constructing the record batch + /// which includes finalizing the grouping expressions + /// (e.g. emit from the hash table in case of hash aggregation) and the accumulators + pub(crate) emitting_time: Time, +} + +impl GroupByMetrics { + pub(crate) fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self { + Self { + time_calculating_group_ids: MetricBuilder::new(metrics) + .subset_time("time_calculating_group_ids", partition), + aggregate_arguments_time: MetricBuilder::new(metrics) + .subset_time("aggregate_arguments_time", partition), + aggregation_time: MetricBuilder::new(metrics) + .subset_time("aggregation_time", partition), + emitting_time: MetricBuilder::new(metrics) + .subset_time("emitting_time", partition), + } + } +} + +#[cfg(test)] +mod tests { + use crate::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; + use crate::metrics::MetricsSet; + use crate::test::TestMemoryExec; + use crate::{collect, ExecutionPlan}; + use arrow::array::{Float64Array, UInt32Array}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + use datafusion_common::Result; + use datafusion_execution::TaskContext; + use datafusion_functions_aggregate::count::count_udaf; + use datafusion_functions_aggregate::sum::sum_udaf; + use datafusion_physical_expr::aggregate::AggregateExprBuilder; + use datafusion_physical_expr::expressions::col; + use std::sync::Arc; + + /// Helper function to verify all three GroupBy metrics exist and have non-zero values + fn assert_groupby_metrics(metrics: &MetricsSet) { + let agg_arguments_time = metrics.sum_by_name("aggregate_arguments_time"); + assert!(agg_arguments_time.is_some()); + assert!(agg_arguments_time.unwrap().as_usize() > 0); + + let aggregation_time = metrics.sum_by_name("aggregation_time"); + assert!(aggregation_time.is_some()); + assert!(aggregation_time.unwrap().as_usize() > 0); + + let emitting_time = metrics.sum_by_name("emitting_time"); + assert!(emitting_time.is_some()); + assert!(emitting_time.unwrap().as_usize() > 0); + } + + #[tokio::test] + async fn test_groupby_metrics_partial_mode() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::Float64, false), + ])); + + // Create multiple batches to ensure metrics accumulate + let batches = (0..5) + .map(|i| { + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(UInt32Array::from(vec![1, 2, 3, 4])), + Arc::new(Float64Array::from(vec![ + i as f64, + (i + 1) as f64, + (i + 2) as f64, + (i + 3) as f64, + ])), + ], + ) + .unwrap() + }) + .collect::>(); + + let input = TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?; + + let group_by = + PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]); + + let aggregates = vec![ + Arc::new( + AggregateExprBuilder::new(sum_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("SUM(b)") + .build()?, + ), + Arc::new( + AggregateExprBuilder::new(count_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("COUNT(b)") + .build()?, + ), + ]; + + let aggregate_exec = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + group_by, + aggregates, + vec![None, None], + input, + schema, + )?); + + let task_ctx = Arc::new(TaskContext::default()); + let _result = + collect(Arc::clone(&aggregate_exec) as _, Arc::clone(&task_ctx)).await?; + + let metrics = 
aggregate_exec.metrics().unwrap(); + assert_groupby_metrics(&metrics); + + Ok(()) + } + + #[tokio::test] + async fn test_groupby_metrics_final_mode() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::Float64, false), + ])); + + let batches = (0..3) + .map(|i| { + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(UInt32Array::from(vec![1, 2, 3])), + Arc::new(Float64Array::from(vec![ + i as f64, + (i + 1) as f64, + (i + 2) as f64, + ])), + ], + ) + .unwrap() + }) + .collect::>(); + + let partial_input = + TestMemoryExec::try_new_exec(&[batches], Arc::clone(&schema), None)?; + + let group_by = + PhysicalGroupBy::new_single(vec![(col("a", &schema)?, "a".to_string())]); + + let aggregates = vec![Arc::new( + AggregateExprBuilder::new(sum_udaf(), vec![col("b", &schema)?]) + .schema(Arc::clone(&schema)) + .alias("SUM(b)") + .build()?, + )]; + + // Create partial aggregate + let partial_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + group_by.clone(), + aggregates.clone(), + vec![None], + partial_input, + Arc::clone(&schema), + )?); + + // Create final aggregate + let final_aggregate = Arc::new(AggregateExec::try_new( + AggregateMode::Final, + group_by.as_final(), + aggregates, + vec![None], + partial_aggregate, + schema, + )?); + + let task_ctx = Arc::new(TaskContext::default()); + let _result = + collect(Arc::clone(&final_aggregate) as _, Arc::clone(&task_ctx)).await?; + + let metrics = final_aggregate.metrics().unwrap(); + assert_groupby_metrics(&metrics); + + Ok(()) + } +} diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs index 316fbe11ae31..5f2a2faa1112 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/mod.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/mod.rs @@ -46,8 +46,11 @@ use crate::aggregates::{ order::GroupOrdering, }; +mod metrics; mod null_builder; +pub(crate) use metrics::GroupByMetrics; + /// Stores the group values during hash aggregation. 
/// /// # Background diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 6132a8b0add5..98c8cb235ca4 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -23,7 +23,7 @@ use std::vec; use super::order::GroupOrdering; use super::AggregateExec; -use crate::aggregates::group_values::{new_group_values, GroupValues}; +use crate::aggregates::group_values::{new_group_values, GroupByMetrics, GroupValues}; use crate::aggregates::order::GroupOrderingFull; use crate::aggregates::{ create_schema, evaluate_group_by, evaluate_many, evaluate_optional, AggregateMode, @@ -49,6 +49,7 @@ use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{GroupsAccumulatorAdapter, PhysicalSortExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; +use datafusion_common::instant::Instant; use futures::ready; use futures::stream::{Stream, StreamExt}; use log::debug; @@ -430,6 +431,9 @@ pub(crate) struct GroupedHashAggregateStream { /// Execution metrics baseline_metrics: BaselineMetrics, + + /// Aggregation-specific metrics + group_by_metrics: GroupByMetrics, } impl GroupedHashAggregateStream { @@ -447,6 +451,7 @@ impl GroupedHashAggregateStream { let batch_size = context.session_config().batch_size(); let input = agg.input.execute(partition, Arc::clone(&context))?; let baseline_metrics = BaselineMetrics::new(&agg.metrics, partition); + let group_by_metrics = GroupByMetrics::new(&agg.metrics, partition); let timer = baseline_metrics.elapsed_compute().timer(); @@ -609,6 +614,7 @@ impl GroupedHashAggregateStream { current_group_indices: Default::default(), exec_state, baseline_metrics, + group_by_metrics, batch_size, group_ordering, input_done: false, @@ -830,12 +836,25 @@ impl GroupedHashAggregateStream { evaluate_group_by(&self.group_by, &batch)? }; + // Only create the timer if there are actual aggregate arguments to evaluate + let timer = match ( + self.spill_state.is_stream_merging, + self.spill_state.merging_aggregate_arguments.is_empty(), + self.aggregate_arguments.is_empty(), + ) { + (true, false, _) | (false, _, false) => { + Some(self.group_by_metrics.aggregate_arguments_time.timer()) + } + _ => None, + }; + // Evaluate the aggregation expressions. let input_values = if self.spill_state.is_stream_merging { evaluate_many(&self.spill_state.merging_aggregate_arguments, &batch)? } else { evaluate_many(&self.aggregate_arguments, &batch)? 
}; + drop(timer); // Evaluate the filter expressions, if any, against the inputs let filter_values = if self.spill_state.is_stream_merging { @@ -846,6 +865,8 @@ impl GroupedHashAggregateStream { }; for group_values in &group_by_values { + let groups_start_time = Instant::now(); + // calculate the group indices for each input row let starting_num_groups = self.group_values.len(); self.group_values @@ -862,6 +883,12 @@ impl GroupedHashAggregateStream { )?; } + // Use this instant for both measurements to save a syscall + let agg_start_time = Instant::now(); + self.group_by_metrics + .time_calculating_group_ids + .add_duration(agg_start_time - groups_start_time); + // Gather the inputs to call the actual accumulator let t = self .accumulators @@ -897,6 +924,9 @@ impl GroupedHashAggregateStream { acc.merge_batch(values, group_indices, None, total_num_groups)?; } } + self.group_by_metrics + .aggregation_time + .add_elapsed(agg_start_time); } } @@ -941,6 +971,7 @@ impl GroupedHashAggregateStream { return Ok(None); } + let timer = self.group_by_metrics.emitting_time.timer(); let mut output = self.group_values.emit(emit_to)?; if let EmitTo::First(n) = emit_to { self.group_ordering.remove_groups(n); @@ -961,12 +992,14 @@ impl GroupedHashAggregateStream { | AggregateMode::SinglePartitioned => output.push(acc.evaluate(emit_to)?), } } + drop(timer); // emit reduces the memory usage. Ignore Err from update_memory_reservation. Even if it is // over the target memory size after emission, we can emit again rather than returning Err. let _ = self.update_memory_reservation(); let batch = RecordBatch::try_new(schema, output)?; debug_assert!(batch.num_rows() > 0); + Ok(Some(batch)) } diff --git a/datafusion/physical-plan/src/aggregates/topk_stream.rs b/datafusion/physical-plan/src/aggregates/topk_stream.rs index 9aaadfd52b96..eb1b7543cbfd 100644 --- a/datafusion/physical-plan/src/aggregates/topk_stream.rs +++ b/datafusion/physical-plan/src/aggregates/topk_stream.rs @@ -17,11 +17,13 @@ //! 
A memory-conscious aggregation implementation that limits group buckets to a fixed number +use crate::aggregates::group_values::GroupByMetrics; use crate::aggregates::topk::priority_map::PriorityMap; use crate::aggregates::{ aggregate_expressions, evaluate_group_by, evaluate_many, AggregateExec, PhysicalGroupBy, }; +use crate::metrics::BaselineMetrics; use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::array::{Array, ArrayRef, RecordBatch}; use arrow::datatypes::SchemaRef; @@ -42,6 +44,8 @@ pub struct GroupedTopKAggregateStream { started: bool, schema: SchemaRef, input: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + group_by_metrics: GroupByMetrics, aggregate_arguments: Vec>>, group_by: PhysicalGroupBy, priority_map: PriorityMap, @@ -57,6 +61,8 @@ impl GroupedTopKAggregateStream { let agg_schema = Arc::clone(&aggr.schema); let group_by = aggr.group_by.clone(); let input = aggr.input.execute(partition, Arc::clone(&context))?; + let baseline_metrics = BaselineMetrics::new(&aggr.metrics, partition); + let group_by_metrics = GroupByMetrics::new(&aggr.metrics, partition); let aggregate_arguments = aggregate_expressions(&aggr.aggr_expr, &aggr.mode, group_by.expr.len())?; let (val_field, desc) = aggr @@ -75,6 +81,8 @@ impl GroupedTopKAggregateStream { row_count: 0, schema: agg_schema, input, + baseline_metrics, + group_by_metrics, aggregate_arguments, group_by, priority_map, @@ -90,6 +98,8 @@ impl RecordBatchStream for GroupedTopKAggregateStream { impl GroupedTopKAggregateStream { fn intern(&mut self, ids: ArrayRef, vals: ArrayRef) -> Result<()> { + let _timer = self.group_by_metrics.time_calculating_group_ids.timer(); + let len = ids.len(); self.priority_map.set_batch(ids, Arc::clone(&vals)); @@ -111,7 +121,10 @@ impl Stream for GroupedTopKAggregateStream { mut self: Pin<&mut Self>, cx: &mut Context<'_>, ) -> Poll> { + let elapsed_compute = self.baseline_metrics.elapsed_compute().clone(); + let emitting_time = self.group_by_metrics.emitting_time.clone(); while let Poll::Ready(res) = self.input.poll_next_unpin(cx) { + let _timer = elapsed_compute.timer(); match res { // got a batch, convert to rows and append to our TreeMap Some(Ok(batch)) => { @@ -140,10 +153,15 @@ impl Stream for GroupedTopKAggregateStream { "Exactly 1 group value required" ); let group_by_values = Arc::clone(&group_by_values[0][0]); - let input_values = evaluate_many( - &self.aggregate_arguments, - batches.first().unwrap(), - )?; + let input_values = { + let _timer = (!self.aggregate_arguments.is_empty()).then(|| { + self.group_by_metrics.aggregate_arguments_time.timer() + }); + evaluate_many( + &self.aggregate_arguments, + batches.first().unwrap(), + )? + }; assert_eq!(input_values.len(), 1, "Exactly 1 input required"); assert_eq!(input_values[0].len(), 1, "Exactly 1 input required"); let input_values = Arc::clone(&input_values[0][0]); @@ -157,8 +175,11 @@ impl Stream for GroupedTopKAggregateStream { trace!("partition {} emit None", self.partition); return Poll::Ready(None); } - let cols = self.priority_map.emit()?; - let batch = RecordBatch::try_new(Arc::clone(&self.schema), cols)?; + let batch = { + let _timer = emitting_time.timer(); + let cols = self.priority_map.emit()?; + RecordBatch::try_new(Arc::clone(&self.schema), cols)? 
+ }; trace!( "partition {} emit batch with {} rows", self.partition, From 44b50c32b55794591a6f9ce78face128599b8f2f Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Wed, 29 Oct 2025 22:56:21 +0800 Subject: [PATCH 041/157] chore: Format examples in doc strings - crate datafusion (#18333) ## Which issue does this PR close? Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p datafusion -- --config format_code_in_doc_comments=true` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe/mod.rs | 282 ++++++++++++------ datafusion/core/src/dataframe/parquet.rs | 16 +- datafusion/core/src/execution/context/csv.rs | 11 +- datafusion/core/src/execution/context/mod.rs | 122 ++++---- .../core/src/execution/session_state.rs | 12 +- datafusion/core/src/lib.rs | 32 +- .../aggregation_fuzzer/context_generator.rs | 1 - .../aggregation_fuzzer/data_generator.rs | 2 - .../fuzz_cases/aggregation_fuzzer/fuzzer.rs | 1 - .../aggregation_fuzzer/query_builder.rs | 4 +- datafusion/core/tests/sql/mod.rs | 1 - .../tests/user_defined/user_defined_plan.rs | 1 - 12 files changed, 297 insertions(+), 188 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 287a133273d8..3186c5cb8230 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -258,10 +258,13 @@ impl DataFrame { /// # async fn main() -> Result<()> { /// // datafusion will parse number as i64 first. 
/// let sql = "a > 1 and b in (1, 10)"; - /// let expected = col("a").gt(lit(1 as i64)) - /// .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false)); + /// let expected = col("a") + /// .gt(lit(1 as i64)) + /// .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false)); /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let expr = df.parse_sql_expr(sql)?; /// assert_eq!(expected, expr); /// # Ok(()) @@ -289,14 +292,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.select_columns(&["a", "b"])?; /// let expected = vec![ /// "+---+---+", /// "| a | b |", /// "+---+---+", /// "| 1 | 2 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -329,8 +334,10 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let df : DataFrame = df.select_exprs(&["a * b", "c"])?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let df: DataFrame = df.select_exprs(&["a * b", "c"])?; /// # Ok(()) /// # } /// ``` @@ -357,14 +364,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.select(vec![col("a"), col("b") * col("c")])?; /// let expected = vec![ /// "+---+-----------------------+", /// "| a | ?table?.b * ?table?.c |", /// "+---+-----------------------+", /// "| 1 | 6 |", - /// "+---+-----------------------+" + /// "+---+-----------------------+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -407,7 +416,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // +----+----+----+ /// // | a | b | c | /// // +----+----+----+ @@ -419,7 +430,7 @@ impl DataFrame { /// "| b | c |", /// "+---+---+", /// "| 2 | 3 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -518,7 +529,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.filter(col("a").lt_eq(col("b")))?; /// // all rows where a <= b are returned /// let expected = vec![ @@ -528,7 +541,7 @@ impl DataFrame { /// "| 1 | 2 | 3 |", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # 
assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -557,7 +570,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// /// // The following use is the equivalent of "SELECT MIN(b) GROUP BY a" /// let df1 = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?; @@ -568,7 +583,7 @@ impl DataFrame { /// "| 1 | 2 |", /// "| 4 | 5 |", /// "| 7 | 8 |", - /// "+---+----------------+" + /// "+---+----------------+", /// ]; /// assert_batches_sorted_eq!(expected1, &df1.collect().await?); /// // The following use is the equivalent of "SELECT MIN(b)" @@ -578,7 +593,7 @@ impl DataFrame { /// "| min(?table?.b) |", /// "+----------------+", /// "| 2 |", - /// "+----------------+" + /// "+----------------+", /// ]; /// # assert_batches_sorted_eq!(expected2, &df2.collect().await?); /// # Ok(()) @@ -646,7 +661,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.limit(1, Some(2))?; /// let expected = vec![ /// "+---+---+---+", @@ -654,7 +671,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -683,7 +700,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? ; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone(); /// let df = df.union(d2)?; /// let expected = vec![ @@ -692,7 +711,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 1 | 2 | 3 |", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -723,8 +742,13 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = df.clone().select_columns(&["b", "c", "a"])?.with_column("d", lit("77"))?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = df + /// .clone() + /// .select_columns(&["b", "c", "a"])? 
+ /// .with_column("d", lit("77"))?; /// let df = df.union_by_name(d2)?; /// let expected = vec![ /// "+---+---+---+----+", @@ -732,7 +756,7 @@ impl DataFrame { /// "+---+---+---+----+", /// "| 1 | 2 | 3 | |", /// "| 1 | 2 | 3 | 77 |", - /// "+---+---+---+----+" + /// "+---+---+---+----+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -762,7 +786,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone(); /// let df = df.union_distinct(d2)?; /// // df2 are duplicate of df @@ -771,7 +797,7 @@ impl DataFrame { /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -802,7 +828,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let d2 = df.clone().select_columns(&["b", "c", "a"])?; /// let df = df.union_by_name_distinct(d2)?; /// let expected = vec![ @@ -810,7 +838,7 @@ impl DataFrame { /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -837,14 +865,16 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.distinct()?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -871,15 +901,17 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// // Return a single row (a, b) for each distinct value of a - /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? 
+ /// // Return a single row (a, b) for each distinct value of a + /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?; /// let expected = vec![ /// "+---+---+", /// "| a | b |", /// "+---+---+", /// "| 1 | 2 |", - /// "+---+---+" + /// "+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1125,11 +1157,13 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.sort(vec![ - /// col("a").sort(false, true), // a DESC, nulls first - /// col("b").sort(true, false), // b ASC, nulls last - /// ])?; + /// col("a").sort(false, true), // a DESC, nulls first + /// col("b").sort(true, false), // b ASC, nulls last + /// ])?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", @@ -1176,12 +1210,17 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let left = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let right = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .select(vec![ - /// col("a").alias("a2"), - /// col("b").alias("b2"), - /// col("c").alias("c2")])?; + /// let left = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let right = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .select(vec![ + /// col("a").alias("a2"), + /// col("b").alias("b2"), + /// col("c").alias("c2"), + /// ])?; /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)` /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`. 
/// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?; @@ -1190,13 +1229,12 @@ impl DataFrame { /// "| a | b | c | a2 | b2 | c2 |", /// "+---+---+---+----+----+----+", /// "| 1 | 2 | 3 | 1 | 2 | 3 |", - /// "+---+---+---+----+----+----+" + /// "+---+---+---+----+----+----+", /// ]; /// assert_batches_sorted_eq!(expected, &join.collect().await?); /// # Ok(()) /// # } /// ``` - /// pub fn join( self, right: DataFrame, @@ -1258,7 +1296,7 @@ impl DataFrame { /// "+---+---+---+----+----+----+", /// "| a | b | c | a2 | b2 | c2 |", /// "+---+---+---+----+----+----+", - /// "+---+---+---+----+----+----+" + /// "+---+---+---+----+----+----+", /// ]; /// # assert_batches_sorted_eq!(expected, &join_on.collect().await?); /// # Ok(()) @@ -1290,7 +1328,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?; /// let expected = vec![ /// "+---+---+---+", @@ -1299,7 +1339,7 @@ impl DataFrame { /// "| 1 | 2 | 3 |", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df1.collect().await?); /// # Ok(()) @@ -1328,7 +1368,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let count = df.count().await?; // 1 /// # assert_eq!(count, 1); /// # Ok(()) @@ -1367,7 +1409,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.collect().await?; /// # Ok(()) /// # } @@ -1387,7 +1431,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// df.show().await?; /// # Ok(()) /// # } @@ -1446,7 +1492,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// df.show_limit(10).await?; /// # Ok(()) /// # } @@ -1472,7 +1520,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let stream = df.execute_stream().await?; /// # Ok(()) /// # } @@ -1498,7 +1548,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + 
/// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.collect_partitioned().await?; /// # Ok(()) /// # } @@ -1518,7 +1570,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let batches = df.execute_stream_partitioned().await?; /// # Ok(()) /// # } @@ -1547,7 +1601,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let schema = df.schema(); /// # Ok(()) /// # } @@ -1613,8 +1669,14 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let batches = df.limit(0, Some(100))?.explain(false, false)?.collect().await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let batches = df + /// .limit(0, Some(100))? + /// .explain(false, false)? + /// .collect() + /// .await?; /// # Ok(()) /// # } /// ``` @@ -1637,8 +1699,18 @@ impl DataFrame { /// # async fn main() -> Result<()> { /// use datafusion_expr::{Explain, ExplainOption}; /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let batches = df.limit(0, Some(100))?.explain_with_options(ExplainOption::default().with_verbose(false).with_analyze(false))?.collect().await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let batches = df + /// .limit(0, Some(100))? + /// .explain_with_options( + /// ExplainOption::default() + /// .with_verbose(false) + /// .with_analyze(false), + /// )? 
+ /// .collect() + /// .await?; /// # Ok(()) /// # } /// ``` @@ -1668,7 +1740,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let f = df.registry(); /// // use f.udf("name", vec![...]) to use the udf /// # Ok(()) @@ -1687,15 +1761,19 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.intersect(d2)?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1721,15 +1799,19 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.intersect_distinct(d2)?; /// let expected = vec![ /// "+---+---+---+", /// "| a | b | c |", /// "+---+---+---+", /// "| 1 | 2 | 3 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &df.collect().await?); /// # Ok(()) @@ -1755,8 +1837,12 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let result = df.except(d2)?; /// // those columns are not in example.csv, but in example_long.csv /// let expected = vec![ @@ -1765,7 +1851,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &result.collect().await?); /// # Ok(()) @@ -1791,8 +1877,12 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example_long.csv", CsvReadOptions::new()).await?; - /// let d2 = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new()) + /// .await?; + /// let d2 = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let result = df.except_distinct(d2)?; /// // those columns are not in example.csv, but in example_long.csv 
/// let expected = vec![ @@ -1801,7 +1891,7 @@ impl DataFrame { /// "+---+---+---+", /// "| 4 | 5 | 6 |", /// "| 7 | 8 | 9 |", - /// "+---+---+---+" + /// "+---+---+---+", /// ]; /// # assert_batches_sorted_eq!(expected, &result.collect().await?); /// # Ok(()) @@ -1878,13 +1968,15 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_csv( - /// "output.csv", - /// DataFrameWriteOptions::new(), - /// None, // can also specify CSV writing options here - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_csv( + /// "output.csv", + /// DataFrameWriteOptions::new(), + /// None, // can also specify CSV writing options here + /// ) + /// .await?; /// # fs::remove_file("output.csv")?; /// # Ok(()) /// # } @@ -1948,13 +2040,11 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_json( - /// "output.json", - /// DataFrameWriteOptions::new(), - /// None - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_json("output.json", DataFrameWriteOptions::new(), None) + /// .await?; /// # fs::remove_file("output.json")?; /// # Ok(()) /// # } @@ -2015,7 +2105,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.with_column("ab_sum", col("a") + col("b"))?; /// # Ok(()) /// # } @@ -2089,7 +2181,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.with_column_renamed("ab_sum", "total")?; /// /// # Ok(()) @@ -2222,7 +2316,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// let df = df.cache().await?; /// # Ok(()) /// # } @@ -2266,7 +2362,9 @@ impl DataFrame { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // Fill nulls in only columns "a" and "c": /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?; /// // Fill nulls across all columns: @@ -2337,9 
+2435,9 @@ impl DataFrame { /// Helper for creating DataFrame. /// # Example /// ``` - /// use std::sync::Arc; /// use arrow::array::{ArrayRef, Int32Array, StringArray}; /// use datafusion::prelude::DataFrame; + /// use std::sync::Arc; /// let id: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); /// let name: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"])); /// let df = DataFrame::from_columns(vec![("id", id), ("name", name)]).unwrap(); diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index 930b4fad1d9b..cb8a6cf29541 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -42,13 +42,15 @@ impl DataFrame { /// use datafusion::dataframe::DataFrameWriteOptions; /// let ctx = SessionContext::new(); /// // Sort the data by column "b" and write it to a new location - /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await? - /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first - /// .write_parquet( - /// "output.parquet", - /// DataFrameWriteOptions::new(), - /// None, // can also specify parquet writing options here - /// ).await?; + /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await? + /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first + /// .write_parquet( + /// "output.parquet", + /// DataFrameWriteOptions::new(), + /// None, // can also specify parquet writing options here + /// ) + /// .await?; /// # fs::remove_file("output.parquet")?; /// # Ok(()) /// # } diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs index 15d6d21f038a..e6f95886e91d 100644 --- a/datafusion/core/src/execution/context/csv.rs +++ b/datafusion/core/src/execution/context/csv.rs @@ -37,9 +37,16 @@ impl SessionContext { /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); /// // You can read a single file using `read_csv` - /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv("tests/data/example.csv", CsvReadOptions::new()) + /// .await?; /// // you can also read multiple files: - /// let df = ctx.read_csv(vec!["tests/data/example.csv", "tests/data/example.csv"], CsvReadOptions::new()).await?; + /// let df = ctx + /// .read_csv( + /// vec!["tests/data/example.csv", "tests/data/example.csv"], + /// CsvReadOptions::new(), + /// ) + /// .await?; /// # Ok(()) /// # } /// ``` diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 448ee5264afd..687779787ab5 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -166,22 +166,23 @@ where /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); -/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; -/// let df = df.filter(col("a").lt_eq(col("b")))? -/// .aggregate(vec![col("a")], vec![min(col("b"))])? -/// .limit(0, Some(100))?; -/// let results = df -/// .collect() -/// .await?; +/// let df = ctx +/// .read_csv("tests/data/example.csv", CsvReadOptions::new()) +/// .await?; +/// let df = df +/// .filter(col("a").lt_eq(col("b")))? +/// .aggregate(vec![col("a")], vec![min(col("b"))])? 
+/// .limit(0, Some(100))?; +/// let results = df.collect().await?; /// assert_batches_eq!( -/// &[ -/// "+---+----------------+", -/// "| a | min(?table?.b) |", -/// "+---+----------------+", -/// "| 1 | 2 |", -/// "+---+----------------+", -/// ], -/// &results +/// &[ +/// "+---+----------------+", +/// "| a | min(?table?.b) |", +/// "+---+----------------+", +/// "| 1 | 2 |", +/// "+---+----------------+", +/// ], +/// &results /// ); /// # Ok(()) /// # } @@ -197,21 +198,22 @@ where /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); -/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; +/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()) +/// .await?; /// let results = ctx -/// .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100") -/// .await? -/// .collect() -/// .await?; +/// .sql("SELECT a, min(b) FROM example GROUP BY a LIMIT 100") +/// .await? +/// .collect() +/// .await?; /// assert_batches_eq!( -/// &[ -/// "+---+----------------+", -/// "| a | min(example.b) |", -/// "+---+----------------+", -/// "| 1 | 2 |", -/// "+---+----------------+", -/// ], -/// &results +/// &[ +/// "+---+----------------+", +/// "| a | min(example.b) |", +/// "+---+----------------+", +/// "| 1 | 2 |", +/// "+---+----------------+", +/// ], +/// &results /// ); /// # Ok(()) /// # } @@ -231,18 +233,18 @@ where /// let config = SessionConfig::new().with_batch_size(4 * 1024); /// /// // configure a memory limit of 1GB with 20% slop -/// let runtime_env = RuntimeEnvBuilder::new() +/// let runtime_env = RuntimeEnvBuilder::new() /// .with_memory_limit(1024 * 1024 * 1024, 0.80) /// .build_arc() /// .unwrap(); /// /// // Create a SessionState using the config and runtime_env /// let state = SessionStateBuilder::new() -/// .with_config(config) -/// .with_runtime_env(runtime_env) -/// // include support for built in functions and configurations -/// .with_default_features() -/// .build(); +/// .with_config(config) +/// .with_runtime_env(runtime_env) +/// // include support for built in functions and configurations +/// .with_default_features() +/// .build(); /// /// // Create a SessionContext /// let ctx = SessionContext::from(state); @@ -428,16 +430,14 @@ impl SessionContext { /// # use datafusion::prelude::*; /// # use datafusion::execution::SessionStateBuilder; /// # use datafusion_optimizer::push_down_filter::PushDownFilter; - /// let my_rule = PushDownFilter{}; // pretend it is a new rule - /// // Create a new builder with a custom optimizer rule + /// let my_rule = PushDownFilter {}; // pretend it is a new rule + /// // Create a new builder with a custom optimizer rule /// let context: SessionContext = SessionStateBuilder::new() - /// .with_optimizer_rule(Arc::new(my_rule)) - /// .build() - /// .into(); + /// .with_optimizer_rule(Arc::new(my_rule)) + /// .build() + /// .into(); /// // Enable local file access and convert context back to a builder - /// let builder = context - /// .enable_url_table() - /// .into_state_builder(); + /// let builder = context.enable_url_table().into_state_builder(); /// ``` pub fn into_state_builder(self) -> SessionStateBuilder { let SessionContext { @@ -585,11 +585,10 @@ impl SessionContext { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// ctx - /// .sql("CREATE TABLE foo (x INTEGER)") - /// .await? - /// .collect() - /// .await?; + /// ctx.sql("CREATE TABLE foo (x INTEGER)") + /// .await? 
+ /// .collect() + /// .await?; /// assert!(ctx.table_exist("foo").unwrap()); /// # Ok(()) /// # } @@ -614,14 +613,14 @@ impl SessionContext { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let ctx = SessionContext::new(); - /// let options = SQLOptions::new() - /// .with_allow_ddl(false); - /// let err = ctx.sql_with_options("CREATE TABLE foo (x INTEGER)", options) - /// .await - /// .unwrap_err(); - /// assert!( - /// err.to_string().starts_with("Error during planning: DDL not supported: CreateMemoryTable") - /// ); + /// let options = SQLOptions::new().with_allow_ddl(false); + /// let err = ctx + /// .sql_with_options("CREATE TABLE foo (x INTEGER)", options) + /// .await + /// .unwrap_err(); + /// assert!(err + /// .to_string() + /// .starts_with("Error during planning: DDL not supported: CreateMemoryTable")); /// # Ok(()) /// # } /// ``` @@ -653,8 +652,7 @@ impl SessionContext { /// // provide type information that `a` is an Int32 /// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); /// let df_schema = DFSchema::try_from(schema).unwrap(); - /// let expr = SessionContext::new() - /// .parse_sql_expr(sql, &df_schema)?; + /// let expr = SessionContext::new().parse_sql_expr(sql, &df_schema)?; /// assert_eq!(expected, expr); /// # Ok(()) /// # } @@ -1143,8 +1141,14 @@ impl SessionContext { /// ``` /// use datafusion::execution::context::SessionContext; /// - /// assert_eq!(SessionContext::parse_memory_limit("1M").unwrap(), 1024 * 1024); - /// assert_eq!(SessionContext::parse_memory_limit("1.5G").unwrap(), (1.5 * 1024.0 * 1024.0 * 1024.0) as usize); + /// assert_eq!( + /// SessionContext::parse_memory_limit("1M").unwrap(), + /// 1024 * 1024 + /// ); + /// assert_eq!( + /// SessionContext::parse_memory_limit("1.5G").unwrap(), + /// (1.5 * 1024.0 * 1024.0 * 1024.0) as usize + /// ); /// ``` pub fn parse_memory_limit(limit: &str) -> Result { let (number, unit) = limit.split_at(limit.len() - 1); diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 561e0c363a37..2949b17537d9 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -114,12 +114,12 @@ use uuid::Uuid; /// # use std::sync::Arc; /// # #[tokio::main] /// # async fn main() -> Result<()> { -/// let state = SessionStateBuilder::new() -/// .with_config(SessionConfig::new()) -/// .with_runtime_env(Arc::new(RuntimeEnv::default())) -/// .with_default_features() -/// .build(); -/// Ok(()) +/// let state = SessionStateBuilder::new() +/// .with_config(SessionConfig::new()) +/// .with_runtime_env(Arc::new(RuntimeEnv::default())) +/// .with_default_features() +/// .build(); +/// Ok(()) /// # } /// ``` /// diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 78db28eaacc7..381dd5e9e848 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -86,26 +86,29 @@ //! let ctx = SessionContext::new(); //! //! // create the dataframe -//! let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?; +//! let df = ctx +//! .read_csv("tests/data/example.csv", CsvReadOptions::new()) +//! .await?; //! //! // create a plan -//! let df = df.filter(col("a").lt_eq(col("b")))? -//! .aggregate(vec![col("a")], vec![min(col("b"))])? -//! .limit(0, Some(100))?; +//! let df = df +//! .filter(col("a").lt_eq(col("b")))? +//! .aggregate(vec![col("a")], vec![min(col("b"))])? +//! .limit(0, Some(100))?; //! //! // execute the plan //! 
let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)? -//! .to_string(); +//! let pretty_results = +//! arrow::util::pretty::pretty_format_batches(&results)?.to_string(); //! //! let expected = vec![ //! "+---+----------------+", //! "| a | min(?table?.b) |", //! "+---+----------------+", //! "| 1 | 2 |", -//! "+---+----------------+" +//! "+---+----------------+", //! ]; //! //! assert_eq!(pretty_results.trim().lines().collect::>(), expected); @@ -126,24 +129,27 @@ //! # async fn main() -> Result<()> { //! let ctx = SessionContext::new(); //! -//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?; +//! ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()) +//! .await?; //! //! // create a plan -//! let df = ctx.sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100").await?; +//! let df = ctx +//! .sql("SELECT a, MIN(b) FROM example WHERE a <= b GROUP BY a LIMIT 100") +//! .await?; //! //! // execute the plan //! let results: Vec = df.collect().await?; //! //! // format the results -//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)? -//! .to_string(); +//! let pretty_results = +//! arrow::util::pretty::pretty_format_batches(&results)?.to_string(); //! //! let expected = vec![ //! "+---+----------------+", //! "| a | min(example.b) |", //! "+---+----------------+", //! "| 1 | 2 |", -//! "+---+----------------+" +//! "+---+----------------+", //! ]; //! //! assert_eq!(pretty_results.trim().lines().collect::>(), expected); @@ -630,7 +636,7 @@ //! └─────────────┘ ┗━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━┛ //! ─────────────────────────────────────────────────────────────▶ //! time -//!``` +//! ``` //! //! Note that DataFusion does not use [`tokio::task::spawn_blocking`] for //! CPU-bounded work, because `spawn_blocking` is designed for blocking **IO**, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs index 2abfcd8417cb..fa8ea0b31c02 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/context_generator.rs @@ -44,7 +44,6 @@ use crate::fuzz_cases::aggregation_fuzzer::data_generator::Dataset; /// - hint `sorted` or not /// - `spilling` or not (TODO, I think a special `MemoryPool` may be needed /// to support this) -/// pub struct SessionContextGenerator { /// Current testing dataset dataset: Arc, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index 753a74995d8f..aaf2d1b9bad4 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -39,7 +39,6 @@ use crate::fuzz_cases::record_batch_generator::{ColumnDescr, RecordBatchGenerato /// will generate one `base dataset` firstly. Then the `base dataset` will be sorted /// based on each `sort_key` respectively. 
And finally `len(sort_keys) + 1` datasets /// will be returned -/// #[derive(Debug, Clone)] pub struct DatasetGeneratorConfig { /// Descriptions of columns in datasets, it's `required` @@ -115,7 +114,6 @@ impl DatasetGeneratorConfig { /// /// - Split each batch to multiple batches which each sub-batch in has the randomly `rows num`, /// and this multiple batches will be used to create the `Dataset`. -/// pub struct DatasetGenerator { batch_generator: RecordBatchGenerator, sort_keys_set: Vec>, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs index b90b3e5e32df..1a8ef278cc29 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs @@ -253,7 +253,6 @@ impl AggregationFuzzer { /// /// - `dataset_ref`, the input dataset, store it for error reported when found /// the inconsistency between the one for `ctx` and `expected results`. -/// struct AggregationFuzzTestTask { /// Generated session context in current test case ctx_with_params: SessionContextWithParams, diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs index 209278385b7b..766e2bedd74c 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs @@ -24,7 +24,7 @@ use rand::{rng, seq::SliceRandom, Rng}; /// Creates queries like /// ```sql /// SELECT AGG(..) FROM table_name GROUP BY -///``` +/// ``` #[derive(Debug, Default, Clone)] pub struct QueryBuilder { // =================================== @@ -95,7 +95,6 @@ pub struct QueryBuilder { /// More details can see [`GroupOrdering`]. /// /// [`GroupOrdering`]: datafusion_physical_plan::aggregates::order::GroupOrdering - /// dataset_sort_keys: Vec>, /// If we will also test the no grouping case like: @@ -103,7 +102,6 @@ pub struct QueryBuilder { /// ```text /// SELECT aggr FROM t; /// ``` - /// no_grouping: bool, // ==================================== diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index e212ee269b15..743c8750b521 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -43,7 +43,6 @@ use tempfile::TempDir; /// A macro to assert that some particular line contains two substrings /// /// Usage: `assert_metrics!(actual, operator_name, metrics)` -/// macro_rules! assert_metrics { ($ACTUAL: expr, $OPERATOR_NAME: expr, $METRICS: expr) => { let found = $ACTUAL diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index f0bf15d3483b..ffe0ba021edb 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -56,7 +56,6 @@ //! //! The same answer can be produced by simply keeping track of the top //! N elements, reducing the total amount of required buffer memory. -//! use std::fmt::Debug; use std::hash::Hash; From 8b6c97f00b57c5a75bf505825f497dc2fd93955e Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Thu, 30 Oct 2025 00:49:06 +0800 Subject: [PATCH 042/157] chore: Format examples in doc strings - expr (#18340) ## Which issue does this PR close? 
Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p -- --config format_code_in_doc_comments=true` for the following datasource-related crates: - `datafusion-expr` - `datafusion-expr-common` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. --- .../expr-common/src/interval_arithmetic.rs | 63 ++++++++------- datafusion/expr-common/src/signature.rs | 28 +++---- datafusion/expr/src/expr.rs | 60 +++++++-------- datafusion/expr/src/expr_schema.rs | 21 ++--- datafusion/expr/src/logical_plan/builder.rs | 15 ++-- datafusion/expr/src/logical_plan/display.rs | 12 +-- datafusion/expr/src/logical_plan/extension.rs | 28 +++---- datafusion/expr/src/logical_plan/plan.rs | 77 ++++++++++--------- datafusion/expr/src/select_expr.rs | 6 +- datafusion/expr/src/udf.rs | 9 +-- datafusion/expr/src/utils.rs | 21 +---- datafusion/expr/src/window_frame.rs | 1 - 12 files changed, 168 insertions(+), 173 deletions(-) diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index b5b632076b00..40c44cfb3ca2 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -1670,22 +1670,23 @@ fn cast_scalar_value( /// /// // [1, 2) U {NULL} /// let maybe_null = NullableInterval::MaybeNull { -/// values: Interval::try_new( -/// ScalarValue::Int32(Some(1)), -/// ScalarValue::Int32(Some(2)), -/// ).unwrap(), +/// values: Interval::try_new( +/// ScalarValue::Int32(Some(1)), +/// ScalarValue::Int32(Some(2)), +/// ) +/// .unwrap(), /// }; /// /// // (0, ∞) /// let not_null = NullableInterval::NotNull { -/// values: Interval::try_new( -/// ScalarValue::Int32(Some(0)), -/// ScalarValue::Int32(None), -/// ).unwrap(), +/// values: Interval::try_new(ScalarValue::Int32(Some(0)), ScalarValue::Int32(None)) +/// .unwrap(), /// }; /// /// // {NULL} -/// let null_interval = NullableInterval::Null { datatype: DataType::Int32 }; +/// let null_interval = NullableInterval::Null { +/// datatype: DataType::Int32, +/// }; /// /// // {4} /// let single_value = NullableInterval::from(ScalarValue::Int32(Some(4))); @@ -1787,22 +1788,26 @@ impl NullableInterval { /// /// ``` /// use datafusion_common::ScalarValue; - /// use datafusion_expr_common::operator::Operator; /// use datafusion_expr_common::interval_arithmetic::Interval; /// use datafusion_expr_common::interval_arithmetic::NullableInterval; + /// use datafusion_expr_common::operator::Operator; /// /// // 4 > 3 -> true /// let lhs = NullableInterval::from(ScalarValue::Int32(Some(4))); /// let rhs = NullableInterval::from(ScalarValue::Int32(Some(3))); /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap(); - /// assert_eq!(result, NullableInterval::from(ScalarValue::Boolean(Some(true)))); + /// assert_eq!( + /// result, + /// NullableInterval::from(ScalarValue::Boolean(Some(true))) + /// ); /// /// // [1, 3) > NULL -> NULL /// let lhs = NullableInterval::NotNull { /// values: Interval::try_new( - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(Some(3)), - /// ).unwrap(), + /// ScalarValue::Int32(Some(1)), + 
/// ScalarValue::Int32(Some(3)), + /// ) + /// .unwrap(), /// }; /// let rhs = NullableInterval::from(ScalarValue::Int32(None)); /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap(); @@ -1811,22 +1816,27 @@ impl NullableInterval { /// // [1, 3] > [2, 4] -> [false, true] /// let lhs = NullableInterval::NotNull { /// values: Interval::try_new( - /// ScalarValue::Int32(Some(1)), - /// ScalarValue::Int32(Some(3)), - /// ).unwrap(), + /// ScalarValue::Int32(Some(1)), + /// ScalarValue::Int32(Some(3)), + /// ) + /// .unwrap(), /// }; /// let rhs = NullableInterval::NotNull { - /// values: Interval::try_new( - /// ScalarValue::Int32(Some(2)), - /// ScalarValue::Int32(Some(4)), - /// ).unwrap(), + /// values: Interval::try_new( + /// ScalarValue::Int32(Some(2)), + /// ScalarValue::Int32(Some(4)), + /// ) + /// .unwrap(), /// }; /// let result = lhs.apply_operator(&Operator::Gt, &rhs).unwrap(); /// // Both inputs are valid (non-null), so result must be non-null - /// assert_eq!(result, NullableInterval::NotNull { - /// // Uncertain whether inequality is true or false - /// values: Interval::UNCERTAIN, - /// }); + /// assert_eq!( + /// result, + /// NullableInterval::NotNull { + /// // Uncertain whether inequality is true or false + /// values: Interval::UNCERTAIN, + /// } + /// ); /// ``` pub fn apply_operator(&self, op: &Operator, rhs: &Self) -> Result { match op { @@ -1924,7 +1934,8 @@ impl NullableInterval { /// values: Interval::try_new( /// ScalarValue::Int32(Some(1)), /// ScalarValue::Int32(Some(4)), - /// ).unwrap(), + /// ) + /// .unwrap(), /// }; /// assert_eq!(interval.single_value(), None); /// ``` diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 38eef077c5af..5cb7a17ee312 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -127,11 +127,10 @@ pub enum Arity { /// ``` /// # use arrow::datatypes::DataType; /// # use datafusion_expr_common::signature::{TypeSignature}; -/// // Declares the function must be invoked with a single argument of type `Utf8View`. -/// // if a user calls the function with `Utf8` or `LargeUtf8`, DataFusion will -/// // automatically add a cast to `Utf8View` during planning. -/// let type_signature = TypeSignature::Exact(vec![DataType::Utf8View]); -/// +/// // Declares the function must be invoked with a single argument of type `Utf8View`. +/// // if a user calls the function with `Utf8` or `LargeUtf8`, DataFusion will +/// // automatically add a cast to `Utf8View` during planning. 
+/// let type_signature = TypeSignature::Exact(vec![DataType::Utf8View]); /// ``` /// /// # Example: Timestamps @@ -144,11 +143,11 @@ pub enum Arity { /// # use arrow::datatypes::{DataType, TimeUnit}; /// # use datafusion_expr_common::signature::{TIMEZONE_WILDCARD, TypeSignature}; /// let type_signature = TypeSignature::Exact(vec![ -/// // A nanosecond precision timestamp with ANY timezone -/// // matches Timestamp(Nanosecond, Some("+0:00")) -/// // matches Timestamp(Nanosecond, Some("+5:00")) -/// // does not match Timestamp(Nanosecond, None) -/// DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())), +/// // A nanosecond precision timestamp with ANY timezone +/// // matches Timestamp(Nanosecond, Some("+0:00")) +/// // matches Timestamp(Nanosecond, Some("+5:00")) +/// // does not match Timestamp(Nanosecond, None) +/// DataType::Timestamp(TimeUnit::Nanosecond, Some(TIMEZONE_WILDCARD.into())), /// ]); /// ``` #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] @@ -858,8 +857,8 @@ fn get_data_types(native_type: &NativeType) -> Vec { /// # Examples /// /// ``` +/// use datafusion_common::types::{logical_binary, logical_string, NativeType}; /// use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; -/// use datafusion_common::types::{NativeType, logical_binary, logical_string}; /// /// // Exact coercion that only accepts timestamp types /// let exact = Coercion::new_exact(TypeSignatureClass::Timestamp); @@ -868,7 +867,7 @@ fn get_data_types(native_type: &NativeType) -> Vec { /// let implicit = Coercion::new_implicit( /// TypeSignatureClass::Native(logical_string()), /// vec![TypeSignatureClass::Native(logical_binary())], -/// NativeType::String +/// NativeType::String, /// ); /// ``` /// @@ -1275,8 +1274,9 @@ impl Signature { /// ``` /// # use datafusion_expr_common::signature::{Signature, Volatility}; /// # use arrow::datatypes::DataType; - /// let sig = Signature::exact(vec![DataType::Int32, DataType::Utf8], Volatility::Immutable) - /// .with_parameter_names(vec!["count".to_string(), "name".to_string()]); + /// let sig = + /// Signature::exact(vec![DataType::Int32, DataType::Utf8], Volatility::Immutable) + /// .with_parameter_names(vec!["count".to_string(), "name".to_string()]); /// ``` /// /// # Errors diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 6077b3c1e5bb..94dcd2a86150 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -164,11 +164,11 @@ impl From for NullTreatment { /// # use datafusion_expr::{lit, col, Operator, Expr}; /// // Use the `+` operator to add two columns together /// let expr = col("c1") + col("c2"); -/// assert!(matches!(expr, Expr::BinaryExpr { ..} )); +/// assert!(matches!(expr, Expr::BinaryExpr { .. })); /// if let Expr::BinaryExpr(binary_expr) = expr { -/// assert_eq!(*binary_expr.left, col("c1")); -/// assert_eq!(*binary_expr.right, col("c2")); -/// assert_eq!(binary_expr.op, Operator::Plus); +/// assert_eq!(*binary_expr.left, col("c1")); +/// assert_eq!(*binary_expr.right, col("c2")); +/// assert_eq!(binary_expr.op, Operator::Plus); /// } /// ``` /// @@ -179,12 +179,12 @@ impl From for NullTreatment { /// # use datafusion_common::ScalarValue; /// # use datafusion_expr::{lit, col, Operator, Expr}; /// let expr = col("c1").eq(lit(42_i32)); -/// assert!(matches!(expr, Expr::BinaryExpr { .. } )); +/// assert!(matches!(expr, Expr::BinaryExpr { .. 
})); /// if let Expr::BinaryExpr(binary_expr) = expr { -/// assert_eq!(*binary_expr.left, col("c1")); -/// let scalar = ScalarValue::Int32(Some(42)); -/// assert_eq!(*binary_expr.right, Expr::Literal(scalar, None)); -/// assert_eq!(binary_expr.op, Operator::Eq); +/// assert_eq!(*binary_expr.left, col("c1")); +/// let scalar = ScalarValue::Int32(Some(42)); +/// assert_eq!(*binary_expr.right, Expr::Literal(scalar, None)); +/// assert_eq!(binary_expr.op, Operator::Eq); /// } /// ``` /// @@ -197,22 +197,22 @@ impl From for NullTreatment { /// # use datafusion_expr::Expr; /// // Create a schema c1(int, c2 float) /// let arrow_schema = Schema::new(vec![ -/// Field::new("c1", DataType::Int32, false), -/// Field::new("c2", DataType::Float64, false), +/// Field::new("c1", DataType::Int32, false), +/// Field::new("c2", DataType::Float64, false), /// ]); /// // DFSchema is a an Arrow schema with optional relation name -/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema) -/// .unwrap(); +/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap(); /// /// // Form Vec with an expression for each column in the schema -/// let exprs: Vec<_> = df_schema.iter() -/// .map(Expr::from) -/// .collect(); -/// -/// assert_eq!(exprs, vec![ -/// Expr::from(Column::from_qualified_name("t1.c1")), -/// Expr::from(Column::from_qualified_name("t1.c2")), -/// ]); +/// let exprs: Vec<_> = df_schema.iter().map(Expr::from).collect(); +/// +/// assert_eq!( +/// exprs, +/// vec![ +/// Expr::from(Column::from_qualified_name("t1.c1")), +/// Expr::from(Column::from_qualified_name("t1.c2")), +/// ] +/// ); /// ``` /// /// # Examples: Displaying `Exprs` @@ -273,12 +273,13 @@ impl From for NullTreatment { /// let mut scalars = HashSet::new(); /// // apply recursively visits all nodes in the expression tree /// expr.apply(|e| { -/// if let Expr::Literal(scalar, _) = e { -/// scalars.insert(scalar); -/// } -/// // The return value controls whether to continue visiting the tree -/// Ok(TreeNodeRecursion::Continue) -/// }).unwrap(); +/// if let Expr::Literal(scalar, _) = e { +/// scalars.insert(scalar); +/// } +/// // The return value controls whether to continue visiting the tree +/// Ok(TreeNodeRecursion::Continue) +/// }) +/// .unwrap(); /// // All subtrees have been visited and literals found /// assert_eq!(scalars.len(), 2); /// assert!(scalars.contains(&ScalarValue::Int32(Some(5)))); @@ -1640,7 +1641,6 @@ impl Expr { /// let metadata = FieldMetadata::from(metadata); /// let expr = col("foo").alias_with_metadata("bar", Some(metadata)); /// ``` - /// pub fn alias_with_metadata( self, name: impl Into, @@ -1670,9 +1670,9 @@ impl Expr { /// # use datafusion_common::metadata::FieldMetadata; /// let metadata = HashMap::from([("key".to_string(), "value".to_string())]); /// let metadata = FieldMetadata::from(metadata); - /// let expr = col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata)); + /// let expr = + /// col("foo").alias_qualified_with_metadata(Some("tbl"), "bar", Some(metadata)); /// ``` - /// pub fn alias_qualified_with_metadata( self, relation: Option>, diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 8c557a5630f0..9e8d6080b82c 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -82,15 +82,17 @@ impl ExprSchemable for Expr { /// # use std::collections::HashMap; /// /// fn main() { - /// let expr = col("c1") + col("c2"); - /// let schema = DFSchema::from_unqualified_fields( 
- /// vec![ - /// Field::new("c1", DataType::Int32, true), - /// Field::new("c2", DataType::Float32, true), - /// ].into(), - /// HashMap::new(), - /// ).unwrap(); - /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); + /// let expr = col("c1") + col("c2"); + /// let schema = DFSchema::from_unqualified_fields( + /// vec![ + /// Field::new("c1", DataType::Int32, true), + /// Field::new("c2", DataType::Float32, true), + /// ] + /// .into(), + /// HashMap::new(), + /// ) + /// .unwrap(); + /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); /// } /// ``` /// @@ -734,7 +736,6 @@ impl Expr { /// new projection with the casted expression. /// 2. **Non-projection plan**: If the subquery isn't a projection, it adds a projection to the plan /// with the casted first column. -/// pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { if subquery.subquery.schema().field(0).data_type() == cast_to_type { return Ok(subquery); diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index a430add3f786..b9afd894d77d 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -450,14 +450,13 @@ impl LogicalPlanBuilder { /// # ])) as _; /// # let table_source = Arc::new(LogicalTableSource::new(employee_schema)); /// // VALUES (1), (2) - /// let input = LogicalPlanBuilder::values(vec![vec![lit(1)], vec![lit(2)]])? - /// .build()?; + /// let input = LogicalPlanBuilder::values(vec![vec![lit(1)], vec![lit(2)]])?.build()?; /// // INSERT INTO MyTable VALUES (1), (2) /// let insert_plan = LogicalPlanBuilder::insert_into( - /// input, - /// "MyTable", - /// table_source, - /// InsertOp::Append, + /// input, + /// "MyTable", + /// table_source, + /// InsertOp::Append, /// )?; /// # Ok(()) /// # } @@ -953,8 +952,8 @@ impl LogicalPlanBuilder { /// // Form the expression `(left.a != right.a)` AND `(left.b != right.b)` /// let exprs = vec![ /// col("left.a").eq(col("right.a")), - /// col("left.b").not_eq(col("right.b")) - /// ]; + /// col("left.b").not_eq(col("right.b")), + /// ]; /// /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)` /// // finding all pairs of rows from `left` and `right` where diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs index ea08c223e8f4..b60126335598 100644 --- a/datafusion/expr/src/logical_plan/display.rs +++ b/datafusion/expr/src/logical_plan/display.rs @@ -94,17 +94,17 @@ impl<'n> TreeNodeVisitor<'n> for IndentVisitor<'_, '_> { /// `foo:Utf8;N` if `foo` is nullable. 
/// /// ``` -/// use arrow::datatypes::{Field, Schema, DataType}; +/// use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_expr::logical_plan::display_schema; /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), /// Field::new("first_name", DataType::Utf8, true), -/// ]); +/// ]); /// -/// assert_eq!( -/// "[id:Int32, first_name:Utf8;N]", -/// format!("{}", display_schema(&schema)) -/// ); +/// assert_eq!( +/// "[id:Int32, first_name:Utf8;N]", +/// format!("{}", display_schema(&schema)) +/// ); /// ``` pub fn display_schema(schema: &Schema) -> impl fmt::Display + '_ { struct Wrapper<'a>(&'a Schema); diff --git a/datafusion/expr/src/logical_plan/extension.rs b/datafusion/expr/src/logical_plan/extension.rs index a8ee7885644a..fe324d40fd95 100644 --- a/datafusion/expr/src/logical_plan/extension.rs +++ b/datafusion/expr/src/logical_plan/extension.rs @@ -39,10 +39,10 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync { /// # struct Dummy { } /// /// # impl Dummy { - /// // canonical boiler plate - /// fn as_any(&self) -> &dyn Any { - /// self - /// } + /// // canonical boiler plate + /// fn as_any(&self) -> &dyn Any { + /// self + /// } /// # } /// ``` fn as_any(&self) -> &dyn Any; @@ -131,18 +131,18 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync { /// // User defined node that derives Hash /// #[derive(Hash, Debug, PartialEq, Eq)] /// struct MyNode { - /// val: u64 + /// val: u64, /// } /// /// // impl UserDefinedLogicalNode { /// // ... /// # impl MyNode { - /// // Boiler plate to call the derived Hash impl - /// fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) { + /// // Boiler plate to call the derived Hash impl + /// fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) { /// use std::hash::Hash; /// let mut s = state; /// self.hash(&mut s); - /// } + /// } /// // } /// # } /// ``` @@ -169,19 +169,19 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync { /// // User defined node that derives Eq /// #[derive(Hash, Debug, PartialEq, Eq)] /// struct MyNode { - /// val: u64 + /// val: u64, /// } /// /// // impl UserDefinedLogicalNode { /// // ... 
/// # impl MyNode { - /// // Boiler plate to call the derived Eq impl - /// fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool { + /// // Boiler plate to call the derived Eq impl + /// fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool { /// match other.as_any().downcast_ref::() { - /// Some(o) => self == o, - /// None => false, + /// Some(o) => self == o, + /// None => false, /// } - /// } + /// } /// // } /// # } /// ``` diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 9541f35e3062..0f0d81186d68 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -203,7 +203,6 @@ pub use datafusion_common::{JoinConstraint, JoinType}; /// # Ok(()) /// # } /// ``` -/// #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum LogicalPlan { /// Evaluates an arbitrary list of expressions (essentially a @@ -1267,7 +1266,6 @@ impl LogicalPlan { /// \n TableScan: t1", /// plan.display_indent().to_string() /// ); - /// /// ``` pub fn with_param_values( self, @@ -1561,20 +1559,20 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .filter(col("id").eq(lit(5))).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .filter(col("id").eq(lit(5))) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display_indent /// let display_string = format!("{}", plan.display_indent()); /// - /// assert_eq!("Filter: t1.id = Int32(5)\n TableScan: t1", - /// display_string); + /// assert_eq!("Filter: t1.id = Int32(5)\n TableScan: t1", display_string); /// ``` pub fn display_indent(&self) -> impl Display + '_ { // Boilerplate structure to wrap LogicalPlan with something @@ -1603,21 +1601,24 @@ impl LogicalPlan { /// ``` /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .filter(col("id").eq(lit(5))).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .filter(col("id").eq(lit(5))) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display_indent_schema /// let display_string = format!("{}", plan.display_indent_schema()); /// - /// assert_eq!("Filter: t1.id = Int32(5) [id:Int32]\ + /// assert_eq!( + /// "Filter: t1.id = Int32(5) [id:Int32]\ /// \n TableScan: t1 [id:Int32]", - /// display_string); + /// display_string + /// ); /// ``` pub fn display_indent_schema(&self) -> impl Display + '_ { // Boilerplate structure to wrap LogicalPlan with something @@ -1665,14 
+1666,15 @@ impl LogicalPlan { /// structure, and one with additional details such as schema. /// /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .filter(col("id").eq(lit(5))).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .filter(col("id").eq(lit(5))) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display_graphviz /// let graphviz_string = format!("{}", plan.display_graphviz()); @@ -1684,7 +1686,6 @@ impl LogicalPlan { /// ```bash /// dot -Tpdf < /tmp/example.dot > /tmp/example.pdf /// ``` - /// pub fn display_graphviz(&self) -> impl Display + '_ { // Boilerplate structure to wrap LogicalPlan with something // that that can be formatted @@ -1723,13 +1724,13 @@ impl LogicalPlan { /// Projection: id /// ``` /// ``` - /// use arrow::datatypes::{Field, Schema, DataType}; - /// use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; - /// let schema = Schema::new(vec![ - /// Field::new("id", DataType::Int32, false), - /// ]); - /// let plan = table_scan(Some("t1"), &schema, None).unwrap() - /// .build().unwrap(); + /// use arrow::datatypes::{DataType, Field, Schema}; + /// use datafusion_expr::{col, lit, logical_plan::table_scan, LogicalPlanBuilder}; + /// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + /// let plan = table_scan(Some("t1"), &schema, None) + /// .unwrap() + /// .build() + /// .unwrap(); /// /// // Format using display /// let display_string = format!("{}", plan.display()); diff --git a/datafusion/expr/src/select_expr.rs b/datafusion/expr/src/select_expr.rs index 039df20f397b..bfec4c5844d0 100644 --- a/datafusion/expr/src/select_expr.rs +++ b/datafusion/expr/src/select_expr.rs @@ -44,10 +44,8 @@ use crate::{expr::WildcardOptions, Expr}; /// let wildcard = SelectExpr::Wildcard(WildcardOptions::default()); /// /// // SELECT mytable.* -/// let qualified = SelectExpr::QualifiedWildcard( -/// "mytable".into(), -/// WildcardOptions::default() -/// ); +/// let qualified = +/// SelectExpr::QualifiedWildcard("mytable".into(), WildcardOptions::default()); /// /// // SELECT col1 /// let expr = SelectExpr::Expression(col("col1").into()); diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index c4cd8c006d1f..fd54bb13a62f 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -568,7 +568,6 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// /// * `Some(ScalarUDF)` - A new instance of this function configured with the new settings /// * `None` - If this function does not change with new configuration settings (the default) - /// fn with_updated_config(&self, _config: &ConfigOptions) -> Option { None } @@ -604,10 +603,10 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// # struct Example{} /// # impl Example { /// fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { - /// // report output is only nullable if any one of the arguments are nullable - /// let nullable = 
args.arg_fields.iter().any(|f| f.is_nullable()); - /// let field = Arc::new(Field::new("ignored_name", DataType::Int32, true)); - /// Ok(field) + /// // report output is only nullable if any one of the arguments are nullable + /// let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + /// let field = Arc::new(Field::new("ignored_name", DataType::Int32, true)); + /// Ok(field) /// } /// # } /// ``` diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 74ba99847f70..cd733e0a130a 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -890,7 +890,6 @@ pub fn check_all_columns_from_schema( /// all referenced column of the right side is from the right schema. /// 2. Or opposite. All referenced column of the left side is from the right schema, /// and the right side is from the left schema. -/// pub fn find_valid_equijoin_key_pair( left_key: &Expr, right_key: &Expr, @@ -1034,10 +1033,7 @@ pub fn iter_conjunction_owned(expr: Expr) -> impl Iterator { /// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use split_conjunction_owned to split them /// assert_eq!(split_conjunction_owned(expr), split); @@ -1060,10 +1056,7 @@ pub fn split_conjunction_owned(expr: Expr) -> Vec { /// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use split_binary_owned to split them /// assert_eq!(split_binary_owned(expr, Operator::Plus), split); @@ -1131,10 +1124,7 @@ fn split_binary_impl<'a>( /// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use conjunction to join them together with `AND` /// assert_eq!(conjunction(split), Some(expr)); @@ -1157,10 +1147,7 @@ pub fn conjunction(filters: impl IntoIterator) -> Option { /// let expr = col("a").eq(lit(1)).or(col("b").eq(lit(2))); /// /// // [a=1, b=2] -/// let split = vec![ -/// col("a").eq(lit(1)), -/// col("b").eq(lit(2)), -/// ]; +/// let split = vec![col("a").eq(lit(1)), col("b").eq(lit(2))]; /// /// // use disjunction to join them together with `OR` /// assert_eq!(disjunction(split), Some(expr)); diff --git a/datafusion/expr/src/window_frame.rs b/datafusion/expr/src/window_frame.rs index f72dc10a6950..5fb2916c34e9 100644 --- a/datafusion/expr/src/window_frame.rs +++ b/datafusion/expr/src/window_frame.rs @@ -307,7 +307,6 @@ impl WindowFrame { /// 3. CURRENT ROW /// 4. `` FOLLOWING /// 5. UNBOUNDED FOLLOWING -/// #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum WindowFrameBound { /// 1. UNBOUNDED PRECEDING From 7f6a606b8ace053c871d79c2ee5b5b8ae21e44b9 Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Thu, 30 Oct 2025 00:49:15 +0800 Subject: [PATCH 043/157] chore: Format examples in doc strings - datasource crates (#18338) ## Which issue does this PR close? Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. 
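For reference, the per-crate invocation described under "What changes are included in this PR?" below looks roughly like this (shown here for `datafusion-datasource` as an illustrative pick from the crate list; the other crates were formatted the same way, and the exact command line may have differed slightly):

```bash
# Reformat the Rust examples embedded in /// doc comments of one crate,
# passing rustfmt's format_code_in_doc_comments option as a one-off override.
cargo fmt -p datafusion-datasource -- --config format_code_in_doc_comments=true
```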
This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p -- --config format_code_in_doc_comments=true` for the following datasource-related crates: - `datafusion-datasource` - `datafusion-datasource-arrow` - `datafusion-datasource-avro` - `datafusion-datasource-csv` - `datafusion-datasource-json` - `datafusion-datasource-parquet` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. --------- Co-authored-by: Andrew Lamb --- .../src/avro_to_arrow/reader.rs | 8 +-- .../datasource-parquet/src/page_filter.rs | 1 - datafusion/datasource-parquet/src/source.rs | 2 - datafusion/datasource/src/file_scan_config.rs | 68 +++++++++---------- datafusion/datasource/src/mod.rs | 1 - datafusion/datasource/src/url.rs | 1 - datafusion/datasource/src/write/mod.rs | 12 +++- 7 files changed, 46 insertions(+), 47 deletions(-) diff --git a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs index 9a4d13fc191d..5ef35e2bee89 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/reader.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/reader.rs @@ -64,13 +64,9 @@ impl ReaderBuilder { /// let file = File::open("test/data/basic.avro").unwrap(); /// /// // create a builder, inferring the schema with the first 100 records - /// let builder = ReaderBuilder::new() - /// .read_schema() - /// .with_batch_size(100); + /// let builder = ReaderBuilder::new().read_schema().with_batch_size(100); /// - /// let reader = builder - /// .build::(file) - /// .unwrap(); + /// let reader = builder.build::(file).unwrap(); /// /// reader /// } diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs index 65d1affb44a9..82deedd406ce 100644 --- a/datafusion/datasource-parquet/src/page_filter.rs +++ b/datafusion/datasource-parquet/src/page_filter.rs @@ -90,7 +90,6 @@ use parquet::{ /// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛ /// /// Total rows: 300 -/// /// ``` /// /// Given the predicate `A > 35 AND B = 'F'`: diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 186d922fc373..b7c29f615a19 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -85,7 +85,6 @@ use parquet::encryption::decrypt::FileDecryptionProperties; /// │.───────────────────.│ /// │ ) /// `───────────────────' -/// /// ``` /// /// # Example: Create a `DataSourceExec` @@ -349,7 +348,6 @@ impl ParquetSource { } /// Optional user defined parquet file reader factory. 
- /// pub fn with_parquet_file_reader_factory( mut self, parquet_file_reader_factory: Arc, diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index c52397d9a7cc..072922eb8920 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -1388,25 +1388,25 @@ fn create_output_array( /// correctly sorted on `(A, B, C)` /// /// ```text -///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓ -/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐ -///┃ ┌───────────────┐ ┌──────────────┐ │ ┌──────────────┐ │ ┌─────────────┐ ┃ -/// │ │ 1.parquet │ │ │ │ 2.parquet │ │ │ 3.parquet │ │ │ 4.parquet │ │ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ │Sort: A, B, C │ │ │Sort: A, B, C│ ┃ -/// │ └───────────────┘ │ │ └──────────────┘ │ └──────────────┘ │ └─────────────┘ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ │ │ ┃ -/// │ │ │ │ │ │ -///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ -/// DataFusion DataFusion DataFusion DataFusion -///┃ Partition 1 Partition 2 Partition 3 Partition 4 ┃ -/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ +/// ┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┓ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ┐ +/// ┃ ┌───────────────┐ ┌──────────────┐ │ ┌──────────────┐ │ ┌─────────────┐ ┃ +/// │ │ 1.parquet │ │ │ │ 2.parquet │ │ │ 3.parquet │ │ │ 4.parquet │ │ +/// ┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ │Sort: A, B, C │ │ │Sort: A, B, C│ ┃ +/// │ └───────────────┘ │ │ └──────────────┘ │ └──────────────┘ │ └─────────────┘ │ +/// ┃ │ │ ┃ +/// │ │ │ │ │ │ +/// ┃ │ │ ┃ +/// │ │ │ │ │ │ +/// ┃ │ │ ┃ +/// │ │ │ │ │ │ +/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ +/// DataFusion DataFusion DataFusion DataFusion +/// ┃ Partition 1 Partition 2 Partition 3 Partition 4 ┃ +/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ /// /// DataSourceExec -///``` +/// ``` /// /// However, when more than 1 file is assigned to each partition, each /// partition is NOT correctly sorted on `(A, B, C)`. 
Once the second @@ -1414,25 +1414,25 @@ fn create_output_array( /// the same sorted stream /// ///```text -///┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ -/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ -///┃ ┌───────────────┐ ┌──────────────┐ │ -/// │ │ 1.parquet │ │ │ │ 2.parquet │ ┃ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ -/// │ └───────────────┘ │ │ └──────────────┘ ┃ -///┃ ┌───────────────┐ ┌──────────────┐ │ -/// │ │ 3.parquet │ │ │ │ 4.parquet │ ┃ -///┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ -/// │ └───────────────┘ │ │ └──────────────┘ ┃ -///┃ │ -/// │ │ │ ┃ -///┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ -/// DataFusion DataFusion ┃ -///┃ Partition 1 Partition 2 -/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛ +/// ┏ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃ +/// ┃ ┌───────────────┐ ┌──────────────┐ │ +/// │ │ 1.parquet │ │ │ │ 2.parquet │ ┃ +/// ┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ +/// │ └───────────────┘ │ │ └──────────────┘ ┃ +/// ┃ ┌───────────────┐ ┌──────────────┐ │ +/// │ │ 3.parquet │ │ │ │ 4.parquet │ ┃ +/// ┃ │ Sort: A, B, C │ │Sort: A, B, C │ │ +/// │ └───────────────┘ │ │ └──────────────┘ ┃ +/// ┃ │ +/// │ │ │ ┃ +/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ +/// DataFusion DataFusion ┃ +/// ┃ Partition 1 Partition 2 +/// ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ━ ┛ /// /// DataSourceExec -///``` +/// ``` fn get_projected_output_ordering( base_config: &FileScanConfig, projected_schema: &SchemaRef, diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index 80b44ad5949a..8d988bdb31be 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -310,7 +310,6 @@ pub async fn calculate_range( /// Returns a `Result` wrapping a `usize` that represents the position of the first newline character found within the specified range. If no newline is found, it returns the length of the scanned data, effectively indicating the end of the range. /// /// The function returns an `Error` if any issues arise while reading from the object store or processing the data stream. -/// async fn find_first_newline( object_store: &Arc, location: &Path, diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index 0f31eb7caf41..08e5b6a5df83 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -385,7 +385,6 @@ const GLOB_START_CHARS: [char; 3] = ['?', '*', '[']; /// /// Path delimiters are determined using [`std::path::is_separator`] which /// permits `/` as a path delimiter even on Windows platforms. 
-/// #[cfg(not(target_arch = "wasm32"))] fn split_glob_expression(path: &str) -> Option<(&str, &str)> { let mut last_separator = 0; diff --git a/datafusion/datasource/src/write/mod.rs b/datafusion/datasource/src/write/mod.rs index 3694568682a5..85832f81bc18 100644 --- a/datafusion/datasource/src/write/mod.rs +++ b/datafusion/datasource/src/write/mod.rs @@ -162,7 +162,11 @@ impl ObjectWriterBuilder { /// # let object_store = Arc::new(InMemory::new()); /// let mut builder = ObjectWriterBuilder::new(compression_type, &location, object_store); /// builder.set_buffer_size(Some(20 * 1024 * 1024)); //20 MiB - /// assert_eq!(builder.get_buffer_size(), Some(20 * 1024 * 1024), "Internal error: Builder buffer size doesn't match"); + /// assert_eq!( + /// builder.get_buffer_size(), + /// Some(20 * 1024 * 1024), + /// "Internal error: Builder buffer size doesn't match" + /// ); /// ``` pub fn set_buffer_size(&mut self, buffer_size: Option) { self.buffer_size = buffer_size; @@ -182,7 +186,11 @@ impl ObjectWriterBuilder { /// # let object_store = Arc::new(InMemory::new()); /// let builder = ObjectWriterBuilder::new(compression_type, &location, object_store) /// .with_buffer_size(Some(20 * 1024 * 1024)); //20 MiB - /// assert_eq!(builder.get_buffer_size(), Some(20 * 1024 * 1024), "Internal error: Builder buffer size doesn't match"); + /// assert_eq!( + /// builder.get_buffer_size(), + /// Some(20 * 1024 * 1024), + /// "Internal error: Builder buffer size doesn't match" + /// ); /// ``` pub fn with_buffer_size(mut self, buffer_size: Option) -> Self { self.buffer_size = buffer_size; From 97523e045920ff4f80d80fe883592f9c05630a99 Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Wed, 29 Oct 2025 16:49:38 +0000 Subject: [PATCH 044/157] Insta for enforce_distrubution (easy ones) (#18248) - part of https://github.com/apache/datafusion/issues/15791 All easy cases from https://github.com/apache/datafusion/pull/18185 (that are nicely-ish displayed in git diff). Note on preserving comments: if it was note about what should happen (or what will be tested), it's placed on top of the snapshot. 
If that's something that comments part of the plan, I put it below the plan --- .../enforce_distribution.rs | 2361 +++++++++-------- 1 file changed, 1261 insertions(+), 1100 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 63111f43806b..db011c4be43a 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -66,9 +66,52 @@ use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{ - get_plan_string, DisplayAs, DisplayFormatType, ExecutionPlanProperties, + displayable, get_plan_string, DisplayAs, DisplayFormatType, ExecutionPlanProperties, PlanProperties, Statistics, }; +use insta::Settings; + +/// Helper function to replace only the first occurrence of a regex pattern in a plan +/// Returns (captured_group_1, modified_string) +fn hide_first( + plan: &dyn ExecutionPlan, + regex: &str, + replacement: &str, +) -> (String, String) { + let plan_str = displayable(plan).indent(true).to_string(); + let pattern = regex::Regex::new(regex).unwrap(); + + if let Some(captures) = pattern.captures(&plan_str) { + let full_match = captures.get(0).unwrap(); + let captured_value = captures + .get(1) + .map(|m| m.as_str().to_string()) + .unwrap_or_default(); + let pos = full_match.start(); + let end_pos = full_match.end(); + let mut result = String::with_capacity(plan_str.len()); + result.push_str(&plan_str[..pos]); + result.push_str(replacement); + result.push_str(&plan_str[end_pos..]); + (captured_value, result) + } else { + (String::new(), plan_str) + } +} + +macro_rules! assert_plan { + ($plan: expr, @ $expected:literal) => { + insta::assert_snapshot!( + displayable($plan.as_ref()).indent(true).to_string(), + @ $expected + ) + }; + ($plan: expr, $another_plan: expr) => { + let plan1 = displayable($plan.as_ref()).indent(true).to_string(); + let plan2 = displayable($another_plan.as_ref()).indent(true).to_string(); + assert_eq!(plan1, plan2); + } +} /// Models operators like BoundedWindowExec that require an input /// ordering but is easy to construct @@ -352,22 +395,6 @@ fn ensure_distribution_helper( ensure_distribution(distribution_context, &config).map(|item| item.data.plan) } -/// Test whether plan matches with expected plan -macro_rules! plans_matches_expected { - ($EXPECTED_LINES: expr, $PLAN: expr) => { - let physical_plan = $PLAN; - let actual = get_plan_string(&physical_plan); - - let expected_plan_lines: Vec<&str> = $EXPECTED_LINES - .iter().map(|s| *s).collect(); - - assert_eq!( - expected_plan_lines, actual, - "\n**Original Plan Mismatch\n\nexpected:\n\n{expected_plan_lines:#?}\nactual:\n\n{actual:#?}\n\n" - ); - } -} - fn test_suite_default_config_options() -> ConfigOptions { let mut config = ConfigOptions::new(); @@ -442,6 +469,7 @@ impl TestConfig { self } + // This be deleted in https://github.com/apache/datafusion/pull/18185 /// Perform a series of runs using the current [`TestConfig`], /// assert the expected plan result, /// and return the result plan (for potential subsequent runs). @@ -517,20 +545,79 @@ impl TestConfig { Ok(optimized) } -} -macro_rules! 
assert_plan_txt { - ($EXPECTED_LINES: expr, $PLAN: expr) => { - let expected_lines: Vec<&str> = $EXPECTED_LINES.iter().map(|s| *s).collect(); - // Now format correctly - let actual_lines = get_plan_string(&$PLAN); + /// Perform a series of runs using the current [`TestConfig`] + /// and return the resulting plan, for the caller to assert on. + fn try_to_plan( + &self, + plan: Arc<dyn ExecutionPlan>, + optimizers_to_run: &[Run], + ) -> Result<Arc<dyn ExecutionPlan>> { + // Add the ancillary output requirements operator at the start: + let optimizer = OutputRequirements::new_add_mode(); + let mut optimized = optimizer.optimize(plan.clone(), &self.config)?; - assert_eq!( - &expected_lines, &actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; + // This file has 2 rules that use tree node, apply these rules to original plan consecutively + // After these operations tree nodes should be in a consistent state. + // This code block makes sure that these rules don't violate tree node integrity. + { + let adjusted = if self.config.optimizer.top_down_join_key_reordering { + // Run adjust_input_keys_ordering rule + let plan_requirements = + PlanWithKeyRequirements::new_default(plan.clone()); + let adjusted = plan_requirements + .transform_down(adjust_input_keys_ordering) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + adjusted.plan + } else { + // Run reorder_join_keys_to_inputs rule + plan.clone() + .transform_up(|plan| { + Ok(Transformed::yes(reorder_join_keys_to_inputs(plan)?)) + }) + .data()? + }; + + // Then run ensure_distribution rule + DistributionContext::new_default(adjusted) + .transform_up(|distribution_context| { + ensure_distribution(distribution_context, &self.config) + }) + .data() + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + } + + for run in optimizers_to_run { + optimized = match run { + Run::Distribution => { + let optimizer = EnforceDistribution::new(); + optimizer.optimize(optimized, &self.config)? + } + Run::Sorting => { + let optimizer = EnforceSorting::new(); + optimizer.optimize(optimized, &self.config)? + } + }; + } + + // Remove the ancillary output requirements operator when done: + let optimizer = OutputRequirements::new_remove_mode(); + let optimized = optimizer.optimize(optimized, &self.config)?; + + Ok(optimized) + } + + fn to_plan( + &self, + plan: Arc<dyn ExecutionPlan>, + optimizers_to_run: &[Run], + ) -> Arc<dyn ExecutionPlan> { + self.try_to_plan(plan, optimizers_to_run).unwrap() + } } #[test] @@ -556,6 +643,8 @@ fn multi_hash_joins() -> Result<()> { JoinType::RightAnti, ]; + let settings = Settings::clone_current(); + // Join on (a == b1) let join_on = vec![( Arc::new(Column::new_with_schema("a", &schema()).unwrap()) as _, @@ -564,11 +653,17 @@ fn multi_hash_joins() -> Result<()> { for join_type in join_types { let join = hash_join_exec(left.clone(), right.clone(), &join_on, &join_type); - let join_plan = |shift| -> String { - format!("{}HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, b1@1)]", " ".repeat(shift)) - }; - let join_plan_indent2 = join_plan(2); - let join_plan_indent4 = join_plan(4); + + let mut settings = settings.clone(); + settings.add_filter( + // replace join_type={join_type} with join_type=... to avoid snapshot name issues + format!("join_type={join_type}").as_str(), + "join_type=...", + ); + + insta::allow_duplicates! 
{ + settings.bind( || { + match join_type { JoinType::Inner @@ -589,50 +684,58 @@ fn multi_hash_joins() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(a@0, c@2)]"); - let expected = match join_type { + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { // Should include 3 RepartitionExecs - JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + JoinType::Inner | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => { + + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + }, // Should include 4 RepartitionExecs - _ => vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - &join_plan_indent4, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + _ => { + 
assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, c@2)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + }, }; - let test_config = TestConfig::default(); - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?; + + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); } JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => {} } + + match join_type { JoinType::Inner | JoinType::Left @@ -650,55 +753,64 @@ fn multi_hash_joins() -> Result<()> { let top_join = hash_join_exec(join, parquet_exec(), &top_join_on, &join_type); - let top_join_plan = match join_type { - JoinType::RightSemi | JoinType::RightAnti => - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@1, c@2)]"), - _ => - format!("HashJoinExec: mode=Partitioned, join_type={join_type}, on=[(b1@6, c@2)]"), - }; - let expected = match join_type { + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + + match join_type { // Should include 3 RepartitionExecs - JoinType::Inner | JoinType::Right | JoinType::RightSemi | JoinType::RightAnti => - vec![ - top_join_plan.as_str(), - &join_plan_indent2, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + JoinType::Inner | JoinType::Right => { + assert_plan!(parquet_exec(), @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"); + }, + // Should include 3 RepartitionExecs but have a different "on" + JoinType::RightSemi | JoinType::RightAnti => { + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@1, c@2)] + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), 
input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + + } + // Should include 4 RepartitionExecs - _ => - vec![ - top_join_plan.as_str(), - " RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10", - &join_plan_indent4, - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ], + _ => { + assert_plan!(plan_distrib, @r" + HashJoinExec: mode=Partitioned, join_type=..., on=[(b1@6, c@2)] + RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10 + HashJoinExec: mode=Partitioned, join_type=..., on=[(a@0, b1@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + + }, }; - let test_config = TestConfig::default(); - test_config.run(&expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, top_join, &SORT_DISTRIB_DISTRIB)?; + + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); } JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => {} } + + }); + } } Ok(()) @@ -737,23 +849,27 @@ fn multi_joins_after_alias() -> Result<()> { ); // Output partition need to respect the Alias and should not introduce additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)]", - " ProjectionExec: expr=[a@0 as a1, a@0 as a2]", - " HashJoinExec: mode=Partitioned, join_type=Inner, 
on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, c@2)] + ProjectionExec: expr=[a@0 as a1, a@0 as a2] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); // Join on (a2 == c) let top_join_on = vec![( @@ -764,23 +880,27 @@ fn multi_joins_after_alias() -> Result<()> { let top_join = hash_join_exec(projection, right, &top_join_on, &JoinType::Inner); // Output partition need to respect the Alias and should not introduce additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)]", - " ProjectionExec: expr=[a@0 as a1, a@0 as a2]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), 
&DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a2@1, c@2)] + ProjectionExec: expr=[a@0 as a1, a@0 as a2] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -816,26 +936,29 @@ fn multi_joins_after_multi_alias() -> Result<()> { // The Column 'a' has different meaning now after the two Projections // The original Output partition can not satisfy the Join requirements and need to add an additional RepartitionExec - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " ProjectionExec: expr=[c1@0 as a]", - " ProjectionExec: expr=[c@2 as c1]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, c@2)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + ProjectionExec: expr=[c1@0 as a] + ProjectionExec: expr=[c@2 as c1] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@1)] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10 + RepartitionExec: 
partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -861,22 +984,26 @@ fn join_after_agg_alias() -> Result<()> { let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner); // Only two RepartitionExecs added - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)]", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", - " RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a1@0, a2@0)] + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a2], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -914,23 +1041,27 @@ fn hash_join_key_ordering() -> Result<()> { let join = hash_join_exec(left, right.clone(), &join_on, &JoinType::Inner); // Only two RepartitionExecs added - let expected = &[ - "HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " 
DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + @r" + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b1@1, b@0), (a1@0, a@1)] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1034,30 +1165,35 @@ fn multi_hash_join_key_ordering() -> Result<()> { Arc::new(FilterExec::try_new(predicate, top_join)?); // The bottom joins' join key ordering is adjusted based on the top join. And the top join should not introduce additional RepartitionExec - let expected = &[ - "FilterExec: c@6 > 1", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)]", - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, filter_top_join.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, filter_top_join, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = + test_config.to_plan(filter_top_join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!( + plan_distrib, + 
@r" + FilterExec: c@6 > 1 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(B@2, b1@6), (C@3, c@2), (AA@1, a1@5)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(b@1, b1@1), (c@2, c1@2), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([b@1, c@2, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b1@1, c1@2, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + " + ); + let plan_sort = test_config.to_plan(filter_top_join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1168,34 +1304,34 @@ fn reorder_join_keys_to_left_input() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)]", &join_type); - let reordered = reorder_join_keys_to_inputs(top_join)?; + let reordered = reorder_join_keys_to_inputs(top_join).unwrap(); // The top joins' join key ordering is adjusted based on the children inputs. 
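// A standalone sketch of the capture-and-mask step that the `hide_first` helper
// (defined near the top of this file) applies to the rendered plan before
// snapshotting. It assumes only the `regex` crate, which this file already uses;
// `mask_first_join_type` is a hypothetical name introduced purely for illustration.
fn mask_first_join_type(plan_text: &str) -> (String, String) {
    // Capture the first `join_type=<word>` occurrence and note where it sits.
    let re = regex::Regex::new(r"join_type=(\w+)").unwrap();
    match re.captures(plan_text) {
        Some(caps) => {
            let whole = caps.get(0).unwrap();
            let captured = caps
                .get(1)
                .map(|m| m.as_str().to_string())
                .unwrap_or_default();
            // Splice the mask over the first match only; later occurrences
            // (for example the inner joins) are left untouched.
            let masked = format!(
                "{}join_type=...{}",
                &plan_text[..whole.start()],
                &plan_text[whole.end()..]
            );
            (captured, masked)
        }
        None => (String::new(), plan_text.to_string()),
    }
}
// For example, mask_first_join_type("HashJoinExec: mode=Partitioned, join_type=Inner, on=[(AA@1, a1@5)]")
// should yield ("Inner", "HashJoinExec: mode=Partitioned, join_type=..., on=[(AA@1, a1@5)]").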
- let expected = &[ - top_join_plan.as_str(), - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)]", - " RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - - assert_plan_txt!(expected, reordered); + let (captured_join_type, modified_plan) = + hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=..."); + assert_eq!(captured_join_type, join_type.to_string()); + + insta::allow_duplicates! {insta::assert_snapshot!(modified_plan, @r" +HashJoinExec: mode=Partitioned, join_type=..., on=[(AA@1, a1@5), (B@2, b1@6), (C@3, c@2)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1), (c@2, c1@2)] + RepartitionExec: partitioning=Hash([a@0, b@1, c@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1, c1@2], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +");} } Ok(()) @@ -1302,34 +1438,32 @@ fn reorder_join_keys_to_right_input() -> Result<()> { &top_join_on, &join_type, ); - let top_join_plan = - format!("HashJoinExec: mode=Partitioned, join_type={:?}, on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)]", &join_type); - let reordered = reorder_join_keys_to_inputs(top_join)?; + let reordered = 
reorder_join_keys_to_inputs(top_join).unwrap(); // The top joins' join key ordering is adjusted based on the children inputs. - let expected = &[ - top_join_plan.as_str(), - " ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C]", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)]", - " RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)]", - " RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - - assert_plan_txt!(expected, reordered); + let (_, plan_str) = + hide_first(reordered.as_ref(), r"join_type=(\w+)", "join_type=..."); + insta::allow_duplicates! {insta::assert_snapshot!(plan_str, @r" +HashJoinExec: mode=Partitioned, join_type=..., on=[(C@3, c@2), (B@2, b1@6), (AA@1, a1@5)] + ProjectionExec: expr=[a@0 as A, a@0 as AA, b@1 as B, c@2 as C] + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, a1@0), (b@1, b1@1)] + RepartitionExec: partitioning=Hash([a@0, b@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([a1@0, b1@1], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@2, c1@2), (b@1, b1@1), (a@0, a1@0)] + RepartitionExec: partitioning=Hash([c@2, b@1, a@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([c1@2, b1@1, a1@0], 10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +");} } Ok(()) @@ -1670,52 +1804,52 @@ fn smj_join_key_ordering() -> Result<()> { // Test: run EnforceDistribution, then EnforceSort. 
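// For reference, the two run orders exercised below are the `Run` slices handed
// to `TestConfig::to_plan`. Their definitions live elsewhere in this file and
// are not shown in this hunk; judging by their names, they are assumed to look
// roughly like the following:
//
//     const DISTRIB_DISTRIB_SORT: [Run; 3] =
//         [Run::Distribution, Run::Distribution, Run::Sorting];
//     const SORT_DISTRIB_DISTRIB: [Run; 3] =
//         [Run::Sorting, Run::Distribution, Run::Distribution];
//
// Because EnforceDistribution and EnforceSorting are applied in a different
// order, the two plans snapshotted in this test are allowed to differ.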
// Only two RepartitionExecs added - let expected = &[ - "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]", - " SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a@1 as a2, b@0 as b2]", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, join.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(join.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" +SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]", - " RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC, a3@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a1@0 as a3, b1@1 as b3]", - " ProjectionExec: expr=[a1@1 as a1, b1@0 as b1]", - " AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " RepartitionExec: 
partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC, a2@0 ASC", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a@1 as a2, b@0 as b2]", - " AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, join, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(join, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, @r" +SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)] + RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC, a3@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b3@1 ASC, a3@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a1@0 as a3, b1@1 as b3] + ProjectionExec: expr=[a1@1 as a1, b1@0 as b1] + AggregateExec: mode=FinalPartitioned, gby=[b1@0 as b1, a1@1 as a1], aggr=[] + RepartitionExec: partitioning=Hash([b1@0, a1@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC, a2@0 ASC + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[b2@1 ASC, a2@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@1 as a2, b@0 as b2] + AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, a@1 as a], aggr=[] + RepartitionExec: partitioning=Hash([b@0, a@1], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[b@1 as b, a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -1744,13 +1878,14 @@ fn merge_does_not_need_sort() -> Result<()> { // // The optimizer should not add an additional SortExec as the // data is already sorted - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " CoalesceBatchesExec: target_batch_size=4096", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, exec.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(exec.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + CoalesceBatchesExec: target_batch_size=4096 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: // @@ -1758,13 +1893,14 @@ fn merge_does_not_need_sort() -> Result<()> { // (according to flag: PREFER_EXISTING_SORT) // hence in this case ordering 
lost during CoalescePartitionsExec and re-introduced with // SortExec at the top. - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " CoalesceBatchesExec: target_batch_size=4096", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, exec, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(exec, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + CoalesceBatchesExec: target_batch_size=4096 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -1790,25 +1926,26 @@ fn union_to_interleave() -> Result<()> { aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]); // Only two RepartitionExecs added, no final RepartitionExec required - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[]", - " AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]", - " InterleaveExec", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[] + InterleaveExec + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1834,28 +1971,29 @@ fn union_not_to_interleave() -> Result<()> { aggregate_exec_with_alias(plan, vec![("a1".to_string(), "a2".to_string())]); // Only two RepartitionExecs added, no final RepartitionExec required - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], 
aggr=[]", - " RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20", - " AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[]", - " UnionExec", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]", - " RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - // TestConfig: Prefer existing union. let test_config = TestConfig::default().with_prefer_existing_union(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a2@0 as a2], aggr=[] + RepartitionExec: partitioning=Hash([a2@0], 10), input_partitions=20 + AggregateExec: mode=Partial, gby=[a1@0 as a2], aggr=[] + UnionExec + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[] + RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1865,17 +2003,18 @@ fn added_repartition_to_single_partition() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(parquet_exec(), alias); - let expected = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(&expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + 
"); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1885,18 +2024,19 @@ fn repartition_deepest_node() -> Result<()> { let alias = vec![("a".to_string(), "a".to_string())]; let plan = aggregate_exec_with_alias(filter_exec(parquet_exec()), alias); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1905,19 +2045,20 @@ fn repartition_deepest_node() -> Result<()> { fn repartition_unsorted_limit() -> Result<()> { let plan = limit_exec(filter_exec(parquet_exec())); - let expected = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // nothing sorts the data, so the local limit doesn't require sorted data either - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + // nothing sorts the data, so the local limit doesn't require sorted data either + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1932,17 +2073,18 @@ fn repartition_sorted_limit() -> Result<()> { .into(); let plan = limit_exec(sort_exec(sort_key, parquet_exec())); - let expected = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), 
&DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // data is sorted so can't repartition here + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1960,19 +2102,20 @@ fn repartition_sorted_limit_with_filter() -> Result<()> { sort_key, ); - let expected = &[ - "SortRequiredExec: [c@2 ASC]", - " FilterExec: c@2 = 0", - // We can use repartition here, ordering requirement by SortRequiredExec - // is still satisfied. - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [c@2 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // We can use repartition here, ordering requirement by SortRequiredExec + // is still satisfied. + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -1985,26 +2128,28 @@ fn repartition_ignores_limit() -> Result<()> { alias, ); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // Expect no repartition to happen for local limit - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // repartition should happen prior to 
the filter to maximize parallelism + // Expect no repartition to happen for local limit (DataSourceExec) + + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2013,19 +2158,20 @@ fn repartition_ignores_limit() -> Result<()> { fn repartition_ignores_union() -> Result<()> { let plan = union_exec(vec![parquet_exec(); 5]); - let expected = &[ - "UnionExec", - // Expect no repartition of DataSourceExec - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Expect no repartition of DataSourceExec + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2041,15 +2187,15 @@ fn repartition_through_sort_preserving_merge() -> Result<()> { .into(); let plan = sort_preserving_merge_exec(sort_key, parquet_exec()); - // need resort as the data was not sorted correctly - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2068,24 +2214,25 @@ fn repartition_ignores_sort_preserving_merge() -> Result<()> { parquet_exec_multiple_sorted(vec![sort_key]), ); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort - // + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [c@2 ASC] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // should not sort (as the data was already sorted) // should not repartition, since increased parallelism is not beneficial for SortPReservingMerge - let expected = &[ - 
"SortPreservingMergeExec: [c@2 ASC]", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -2105,27 +2252,29 @@ fn repartition_ignores_sort_preserving_merge_with_union() -> Result<()> { ]); let plan = sort_preserving_merge_exec(sort_key, input); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // // should not repartition / sort (as the data was already sorted) - let expected = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; // test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -2149,16 +2298,17 @@ fn repartition_does_not_destroy_sort() -> Result<()> { // TestConfig: Prefer existing sort. 
let test_config = TestConfig::default().with_prefer_existing_sort(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet +"); // during repartitioning ordering is preserved - let expected = &[ - "SortRequiredExec: [d@3 ASC]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet", - ]; - - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2187,22 +2337,25 @@ fn repartition_does_not_destroy_sort_more_complex() -> Result<()> { let input2 = filter_exec(parquet_exec()); let plan = union_exec(vec![input1, input2]); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +UnionExec + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // union input 1: no repartitioning + // union input 2: should repartition + // // should not repartition below the SortRequired as that // branch doesn't benefit from increased parallelism - let expected = &[ - "UnionExec", - // union input 1: no repartitioning - " SortRequiredExec: [c@2 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - // union input 2: should repartition - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2227,28 +2380,28 @@ fn repartition_transitively_with_projection() -> Result<()> { .into(); let plan = sort_preserving_merge_exec(sort_key, proj); - // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [sum@0 ASC]", - " SortExec: expr=[sum@0 ASC], preserve_partitioning=[true]", - // Since this projection is not trivial, increasing parallelism is beneficial - " ProjectionExec: expr=[a@0 + b@1 as sum]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [sum@0 ASC] + SortExec: expr=[sum@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[sum@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - // Since this projection is not trivial, increasing parallelism is beneficial - " ProjectionExec: expr=[a@0 + b@1 as sum]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[sum@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 + b@1 as sum] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Since this projection is not trivial, increasing parallelism is beneficial Ok(()) } @@ -2275,16 +2428,18 @@ fn repartition_ignores_transitively_with_projection() -> Result<()> { sort_key, ); - let expected = &[ - "SortRequiredExec: [c@2 ASC]", - // Since this projection is trivial, increasing parallelism is not beneficial - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortRequiredExec: [c@2 ASC] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + // Since this projection is trivial, increasing parallelism is not beneficial + + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2310,16 +2465,17 @@ fn repartition_transitively_past_sort_with_projection() -> Result<()> { ), ); - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Since this projection is trivial, increasing parallelism is not beneficial - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, 
c, d, e], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Since this projection is trivial, increasing parallelism is not beneficial + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -2334,28 +2490,30 @@ fn repartition_transitively_past_sort_with_filter() -> Result<()> { .into(); let plan = sort_exec(sort_key, filter_exec(parquet_exec())); - // Test: run EnforceDistribution, then EnforceSort. - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + + // Expect repartition on the input to the sort (as it can benefit from additional parallelism) // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - // Expect repartition on the input of the filter (as it can benefit from additional parallelism) - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Expect repartition on the input of the filter (as it can benefit from additional parallelism) Ok(()) } @@ -2381,30 +2539,32 @@ fn repartition_transitively_past_sort_with_projection_and_filter() -> Result<()> ), ); - // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - // Expect repartition on the input to the sort (as it can benefit from additional parallelism) - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " FilterExec: c@2 = 0", - // repartition is lowest down - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, plan.clone(), &DISTRIB_DISTRIB_SORT)?; + let plan_distrib = test_config.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + + // Expect repartition on the input to the sort (as it can benefit from additional parallelism) + // repartition is lowest down // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected_first_sort_enforcement, plan, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -2420,28 +2580,29 @@ fn parallelization_single_partition() -> Result<()> { .with_query_execution_partitions(2); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: 
partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2465,40 +2626,31 @@ fn parallelization_multiple_files() -> Result<()> { // The groups must have only contiguous ranges of rows from the same file // if any group has rows from multiple files, the data is no longer sorted destroyed // https://github.com/apache/datafusion/issues/8451 - let expected_with_3_target_partitions = [ - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config_concurrency_3 = test_config.clone().with_query_execution_partitions(3); - test_config_concurrency_3.run( - &expected_with_3_target_partitions, - plan.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config_concurrency_3.run( - &expected_with_3_target_partitions, - plan.clone(), - &SORT_DISTRIB_DISTRIB, - )?; + let plan_3_distrib = + test_config_concurrency_3.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_3_distrib, + @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={3 groups: [[x:0..50], [y:0..100], [x:50..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); + let plan_3_sort = + test_config_concurrency_3.to_plan(plan.clone(), &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_3_distrib, plan_3_sort); - let expected_with_8_target_partitions = [ - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; let test_config_concurrency_8 = test_config.with_query_execution_partitions(8); - test_config_concurrency_8.run( - &expected_with_8_target_partitions, - plan.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config_concurrency_8.run( - &expected_with_8_target_partitions, - plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_8_distrib = + test_config_concurrency_8.to_plan(plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_8_distrib, + @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={8 groups: [[x:0..25], [y:0..25], [x:25..50], [y:25..50], [x:50..75], [y:50..75], [x:75..100], [y:75..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); + let plan_8_sort = test_config_concurrency_8.to_plan(plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_8_distrib, plan_8_sort); Ok(()) } @@ -2570,30 +2722,30 @@ fn 
parallelization_two_partitions() -> Result<()> { .with_prefer_repartition_file_scans(10); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Plan already has two partitions - " DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Plan already has two partitions + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Plan already has two partitions - " DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Plan already has two partitions + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2609,30 +2761,32 @@ fn parallelization_two_partitions_into_four() -> Result<()> { .with_prefer_repartition_file_scans(10); // Test: with parquet - let expected_parquet = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Multiple source files split across partitions - " DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - &expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(&expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + // Multiple source files split across partitions + assert_plan!(plan_parquet_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, 
gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Multiple source files split across partitions + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = [ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - // Multiple source files split across partitions - " DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(&expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(&expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + // Multiple source files split across partitions + assert_plan!(plan_csv_distrib, @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Multiple source files split across partitions + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2651,32 +2805,32 @@ fn parallelization_sorted_limit() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Doesn't parallelize for SortExec without preserve_partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // data is sorted so can't repartition here + // Doesn't parallelize for SortExec without preserve_partitioning + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "GlobalLimitExec: skip=0, fetch=100", - " LocalLimitExec: fetch=100", - // data is sorted so can't repartition here - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // Doesn't parallelize for SortExec without preserve_partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = 
test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // data is sorted so can't repartition here + // Doesn't parallelize for SortExec without preserve_partitioning + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2696,40 +2850,41 @@ fn parallelization_limit_with_filter() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // even though data is sorted, we can use repartition here. Since - // ordering is not used in subsequent stages anyway. - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // SortExec doesn't benefit from input partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + // even though data is sorted, we can use repartition here. Since + // ordering is not used in subsequent stages anyway. + // SortExec doesn't benefit from input partitioning + assert_plan!(plan_parquet_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // even though data is sorted, we can use repartition here. Since - // ordering is not used in subsequent stages anyway. - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // SortExec doesn't benefit from input partitioning - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + // even though data is sorted, we can use repartition here. Since + // ordering is not used in subsequent stages anyway. 
+ // SortExec doesn't benefit from input partitioning + assert_plan!(plan_csv_distrib, + @r" +GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2747,48 +2902,49 @@ fn parallelization_ignores_limit() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - // Limit doesn't benefit from input partitioning - no parallelism - " LocalLimitExec: fetch=100", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + "); + // repartition should happen prior to the filter to maximize parallelism + // Limit doesn't benefit from input partitioning - no parallelism + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - " CoalescePartitionsExec", - " LocalLimitExec: fetch=100", - " FilterExec: c@2 = 0", - // repartition should happen prior to the filter to maximize parallelism - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " GlobalLimitExec: skip=0, fetch=100", - // Limit doesn't benefit from input partitioning - no parallelism - " LocalLimitExec: fetch=100", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, 
d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + CoalescePartitionsExec + LocalLimitExec: fetch=100 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + GlobalLimitExec: skip=0, fetch=100 + LocalLimitExec: fetch=100 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + "); + // repartition should happen prior to the filter to maximize parallelism + // Limit doesn't benefit from input partitioning - no parallelism + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2801,34 +2957,35 @@ fn parallelization_union_inputs() -> Result<()> { let test_config = TestConfig::default(); // Test: with parquet - let expected_parquet = &[ - "UnionExec", - // Union doesn't benefit from input partitioning - no parallelism - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + // Union doesn't benefit from input partitioning - no parallelism + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "UnionExec", - // Union doesn't benefit from input partitioning - no parallelism - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - " DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" +UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false +"); + // Union doesn't benefit from input partitioning - no parallelism + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2855,22 +3012,21 @@ fn parallelization_prior_to_sort_preserving_merge() -> Result<()> { // parallelization is not beneficial for SortPreservingMerge // Test: with parquet - let expected_parquet = &[ - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet" + ); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false" + ); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -2900,54 +3056,47 @@ fn parallelization_sort_preserving_merge_with_union() -> Result<()> { // should not sort (as the data was already sorted) // Test: with parquet - let expected_parquet = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - let expected_parquet_first_sort_enforcement = &[ - // no SPM - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // has coalesce - " CoalescePartitionsExec", - " UnionExec", - " 
DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet_first_sort_enforcement, - plan_parquet, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_sort, + @r" + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + // no SPM + // has coalesce // Test: with csv - let expected_csv = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - let expected_csv_first_sort_enforcement = &[ - // no SPM - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - // has coalesce - " CoalescePartitionsExec", - " UnionExec", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run( - expected_csv_first_sort_enforcement, - plan_csv.clone(), - &SORT_DISTRIB_DISTRIB, - )?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + SortPreservingMergeExec: [c@2 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + let plan_csv_sort = test_config.to_plan(plan_csv.clone(), &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_sort, + @r" + SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + // no SPM + // has coalesce Ok(()) } @@ -2975,24 +3124,25 @@ fn parallelization_does_not_benefit() -> Result<()> { // no parallelization, because SortRequiredExec doesn't benefit from increased parallelism // Test: with parquet - let expected_parquet = &[ - "SortRequiredExec: [c@2 
ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_parquet_distrib, + @r" + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); // Test: with csv - let expected_csv = &[ - "SortRequiredExec: [c@2 ASC]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_csv_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_csv_distrib, + @r" + SortRequiredExec: [c@2 ASC] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false + "); + let plan_csv_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_csv_distrib, plan_csv_sort); Ok(()) } @@ -3023,26 +3173,26 @@ fn parallelization_ignores_transitively_with_projection_parquet() -> Result<()> .into(); let plan_parquet = sort_preserving_merge_exec(sort_key_after_projection, proj_parquet); - let expected = &[ - "SortPreservingMergeExec: [c2@1 ASC]", - " ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - plans_matches_expected!(expected, &plan_parquet); + assert_plan!(plan_parquet, + @r" + SortPreservingMergeExec: [c2@1 ASC] + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet + "); + + let test_config = TestConfig::default(); + let plan_parquet_distrib = + test_config.to_plan(plan_parquet.clone(), &DISTRIB_DISTRIB_SORT); // Expected Outcome: // data should not be repartitioned / resorted - let expected_parquet = &[ - "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - let test_config = TestConfig::default(); - test_config.run( - expected_parquet, - plan_parquet.clone(), - &DISTRIB_DISTRIB_SORT, - )?; - test_config.run(expected_parquet, plan_parquet, &SORT_DISTRIB_DISTRIB)?; + assert_plan!(plan_parquet_distrib, + @r" +ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_parquet_sort = test_config.to_plan(plan_parquet, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_parquet_distrib, plan_parquet_sort); Ok(()) } @@ -3071,22 +3221,24 @@ fn parallelization_ignores_transitively_with_projection_csv() -> Result<()> { }] .into(); let plan_csv = sort_preserving_merge_exec(sort_key_after_projection, proj_csv); - let expected = &[ - "SortPreservingMergeExec: [c2@1 ASC]", - " ProjectionExec: expr=[a@0 
as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - plans_matches_expected!(expected, &plan_csv); + assert_plan!(plan_csv, + @r" +SortPreservingMergeExec: [c2@1 ASC] + ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false +"); + let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(plan_csv.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +ProjectionExec: expr=[a@0 as a2, c@2 as c2] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false +"); // Expected Outcome: // data should not be repartitioned / resorted - let expected_csv = &[ - "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=csv, has_header=false", - ]; - let test_config = TestConfig::default(); - test_config.run(expected_csv, plan_csv.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected_csv, plan_csv, &SORT_DISTRIB_DISTRIB)?; + let plan_sort = test_config.to_plan(plan_csv, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3096,24 +3248,25 @@ fn remove_redundant_roundrobins() -> Result<()> { let input = parquet_exec(); let repartition = repartition_exec(repartition_exec(input)); let physical_plan = repartition_exec(filter_exec(repartition)); - let expected = &[ - "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, &physical_plan); - - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; + assert_plan!(physical_plan, + @r" +RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3133,18 +3286,19 @@ fn remove_unnecessary_spm_after_filter() -> Result<()> { // TestConfig: Prefer existing sort. 
let test_config = TestConfig::default().with_prefer_existing_sort(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Expected Outcome: // Original plan expects its output to be ordered by c@2 ASC. // This is still satisfied since, after filter that column is constant. - let expected = &[ - "CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + assert_plan!(plan_distrib, + @r" +CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3164,14 +3318,16 @@ fn preserve_ordering_through_repartition() -> Result<()> { // TestConfig: Prefer existing sort. let test_config = TestConfig::default().with_prefer_existing_sort(); - let expected = &[ - "SortPreservingMergeExec: [d@3 ASC]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [d@3 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=d@3 ASC + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[d@3 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3189,29 +3345,27 @@ fn do_not_preserve_ordering_through_repartition() -> Result<()> { let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. 
- let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - test_config.run( - expected_first_sort_enforcement, - physical_plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -3227,17 +3381,18 @@ fn no_need_for_sort_after_filter() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_preserving_merge_exec(sort_key, filter_exec(input)); - let expected = &[ - // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied. - "CoalescePartitionsExec", - // Since after this stage c is constant. c@2 ASC ordering is already satisfied. - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, @r" +CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); + // After CoalescePartitionsExec c is still constant. Hence c@2 ASC ordering is already satisfied. + // Since after this stage c is constant. c@2 ASC ordering is already satisfied. 
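    // Note (an illustrative summary, not from the upstream patch): each conversion in this
    // file follows the same shape — materialize the optimized plan once per optimizer
    // ordering with `TestConfig::to_plan`, snapshot the first result via
    // `assert_plan!(plan, @r"...")`, and then assert the alternate ordering produces an
    // identical plan with `assert_plan!(plan_distrib, plan_sort)` instead of repeating
    // the expected plan text.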
Ok(()) } @@ -3261,30 +3416,28 @@ fn do_not_preserve_ordering_through_repartition2() -> Result<()> { let test_config = TestConfig::default(); + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); // Test: run EnforceDistribution, then EnforceSort. - let expected = &[ - "SortPreservingMergeExec: [a@0 ASC]", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; + assert_plan!(plan_distrib, + @r" +SortPreservingMergeExec: [a@0 ASC] + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); // Test: result IS DIFFERENT, if EnforceSorting is run first: - let expected_first_sort_enforcement = &[ - "SortExec: expr=[a@0 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " SortExec: expr=[a@0 ASC], preserve_partitioning=[true]", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run( - expected_first_sort_enforcement, - physical_plan, - &SORT_DISTRIB_DISTRIB, - )?; + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_sort, + @r" +SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); Ok(()) } @@ -3300,14 +3453,16 @@ fn do_not_preserve_ordering_through_repartition3() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key]); let physical_plan = filter_exec(input); - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3322,30 +3477,27 @@ fn do_not_put_sort_when_input_is_invalid() -> Result<()> { .into(); let input = parquet_exec(); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); - let expected = &[ - // Ordering requirement of sort required exec is NOT satisfied - // by 
existing ordering at the source. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - assert_plan_txt!(expected, physical_plan); - - let expected = &[ - "SortRequiredExec: [a@0 ASC]", - // Since at the start of the rule ordering requirement is not satisfied - // EnforceDistribution rule doesn't satisfy this requirement either. - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; + // Ordering requirement of sort required exec is NOT satisfied + // by existing ordering at the source. + assert_plan!(physical_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, dist_plan); + // Since at the start of the rule ordering requirement is not satisfied + // EnforceDistribution rule doesn't satisfy this requirement either. + assert_plan!(dist_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -3361,29 +3513,26 @@ fn put_sort_when_input_is_valid() -> Result<()> { let input = parquet_exec_multiple_sorted(vec![sort_key.clone()]); let physical_plan = sort_required_exec_with_req(filter_exec(input), sort_key); - let expected = &[ - // Ordering requirement of sort required exec is satisfied - // by existing ordering at the source. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; - assert_plan_txt!(expected, physical_plan); - - let expected = &[ - // Since at the start of the rule ordering requirement is satisfied - // EnforceDistribution rule satisfy this requirement also. - "SortRequiredExec: [a@0 ASC]", - " FilterExec: c@2 = 0", - " DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet", - ]; + // Ordering requirement of sort required exec is satisfied + // by existing ordering at the source. + assert_plan!(physical_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); let mut config = ConfigOptions::new(); config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, dist_plan); + // Since at the start of the rule ordering requirement is satisfied + // EnforceDistribution rule satisfy this requirement also. 
+ assert_plan!(dist_plan, @r" +SortRequiredExec: [a@0 ASC] + FilterExec: c@2 = 0 + DataSourceExec: file_groups={10 groups: [[x:0..20], [y:0..20], [x:20..40], [y:20..40], [x:40..60], [y:40..60], [x:60..80], [y:60..80], [x:80..100], [y:80..100]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], file_type=parquet +"); Ok(()) } @@ -3404,13 +3553,15 @@ fn do_not_add_unnecessary_hash() -> Result<()> { // Make sure target partition number is 1. In this case hash repartition is unnecessary. let test_config = TestConfig::default().with_query_execution_partitions(1); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3432,19 +3583,21 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { // Make sure target partition number is larger than 2 (e.g partition number at the source). let test_config = TestConfig::default().with_query_execution_partitions(4); - let expected = &[ - "AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - // Since hash requirements of this operator is satisfied. There shouldn't be - // a hash repartition here - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4", - " AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", - " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2", - " DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 + AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] + RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], file_type=parquet +"); + // Since hash requirements of this operator is satisfied. 
There shouldn't be + // a hash repartition here + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3452,19 +3605,21 @@ fn do_not_add_unnecessary_hash2() -> Result<()> { #[test] fn optimize_away_unnecessary_repartition() -> Result<()> { let physical_plan = coalesce_partitions_exec(repartition_exec(parquet_exec())); - let expected = &[ - "CoalescePartitionsExec", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, physical_plan.clone()); - - let expected = - &["DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet"]; + assert_plan!(physical_plan, + @r" +CoalescePartitionsExec + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3474,25 +3629,27 @@ fn optimize_away_unnecessary_repartition2() -> Result<()> { let physical_plan = filter_exec(repartition_exec(coalesce_partitions_exec( filter_exec(repartition_exec(parquet_exec())), ))); - let expected = &[ - "FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " CoalescePartitionsExec", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(expected, physical_plan.clone()); + assert_plan!(physical_plan, + @r" +FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + CoalescePartitionsExec + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); - let expected = &[ - "FilterExec: c@2 = 0", - " FilterExec: c@2 = 0", - " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; let test_config = TestConfig::default(); - test_config.run(expected, physical_plan.clone(), &DISTRIB_DISTRIB_SORT)?; - test_config.run(expected, physical_plan, &SORT_DISTRIB_DISTRIB)?; + let plan_distrib = test_config.to_plan(physical_plan.clone(), &DISTRIB_DISTRIB_SORT); + assert_plan!(plan_distrib, + @r" +FilterExec: c@2 = 0 + FilterExec: c@2 = 0 + RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1 + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); + let plan_sort = test_config.to_plan(physical_plan, &SORT_DISTRIB_DISTRIB); + assert_plan!(plan_distrib, plan_sort); Ok(()) } @@ -3512,27 +3669,31 @@ async fn test_distribute_sort_parquet() -> Result<()> { let physical_plan = 
sort_exec(sort_key, parquet_exec_with_stats(10000 * 8192)); // prior to optimization, this is the starting plan - let starting = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - plans_matches_expected!(starting, physical_plan.clone()); + assert_plan!(physical_plan, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=parquet +"); // what the enforce distribution run does. - let expected = &[ - "SortExec: expr=[c@2 ASC], preserve_partitioning=[false]", - " CoalescePartitionsExec", - " DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, physical_plan.clone(), &[Run::Distribution])?; + let plan_distribution = + test_config.to_plan(physical_plan.clone(), &[Run::Distribution]); + assert_plan!(plan_distribution, + @r" +SortExec: expr=[c@2 ASC], preserve_partitioning=[false] + CoalescePartitionsExec + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet +"); // what the sort parallelization (in enforce sorting), does after the enforce distribution changes - let expected = &[ - "SortPreservingMergeExec: [c@2 ASC]", - " SortExec: expr=[c@2 ASC], preserve_partitioning=[true]", - " DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet", - ]; - test_config.run(expected, physical_plan, &[Run::Distribution, Run::Sorting])?; + let plan_both = + test_config.to_plan(physical_plan, &[Run::Distribution, Run::Sorting]); + assert_plan!(plan_both, + @r" +SortPreservingMergeExec: [c@2 ASC] + SortExec: expr=[c@2 ASC], preserve_partitioning=[true] + DataSourceExec: file_groups={10 groups: [[x:0..8192000], [x:8192000..16384000], [x:16384000..24576000], [x:24576000..32768000], [x:32768000..40960000], [x:40960000..49152000], [x:49152000..57344000], [x:57344000..65536000], [x:65536000..73728000], [x:73728000..81920000]]}, projection=[a, b, c, d, e], file_type=parquet +"); Ok(()) } @@ -3557,12 +3718,12 @@ async fn test_distribute_sort_memtable() -> Result<()> { let physical_plan = dataframe.create_physical_plan().await?; // this is the final, optimized plan - let expected = &[ - "SortPreservingMergeExec: [id@0 ASC NULLS LAST]", - " SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true]", - " DataSourceExec: partitions=3, partition_sizes=[34, 33, 33]", - ]; - plans_matches_expected!(expected, physical_plan); + assert_plan!(physical_plan, + @r" +SortPreservingMergeExec: [id@0 ASC NULLS LAST] + SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] + DataSourceExec: partitions=3, partition_sizes=[34, 33, 33] +"); Ok(()) } From 9b33c92b2a7bc4996a7652b96db45e2aba688620 
Mon Sep 17 00:00:00 2001 From: feniljain <49019259+feniljain@users.noreply.github.com> Date: Thu, 30 Oct 2025 00:21:08 +0530 Subject: [PATCH 045/157] feat: allow pushdown of dynamic filters having partition cols (#18172) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #18171 ## Rationale for this change Included in the issue ## Are these changes tested? While I have tested this on local with a local TPCDS-like dataset, I would appreciate if someone provides me a good way to add tests for the same 😅 --------- Co-authored-by: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Co-authored-by: Claude --- .../examples/csv_json_opener.rs | 6 +- datafusion/core/src/datasource/mod.rs | 1 + .../src/datasource/physical_plan/parquet.rs | 4 +- datafusion/core/src/test_util/parquet.rs | 3 +- .../filter_pushdown/util.rs | 9 ++- datafusion/datasource-arrow/src/source.rs | 4 +- datafusion/datasource-avro/src/source.rs | 7 +- datafusion/datasource-csv/src/source.rs | 6 +- datafusion/datasource-json/src/source.rs | 3 +- datafusion/datasource-parquet/src/source.rs | 25 +++--- datafusion/datasource/src/file.rs | 4 +- datafusion/datasource/src/file_scan_config.rs | 5 +- datafusion/datasource/src/table_schema.rs | 24 ++++-- datafusion/datasource/src/test_util.rs | 5 +- .../test_files/parquet_filter_pushdown.slt | 21 +---- docs/source/library-user-guide/upgrading.md | 78 +++++++++++++++++++ 16 files changed, 150 insertions(+), 55 deletions(-) diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index 8abed90238d4..ef2a3eaca0c8 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -31,7 +31,9 @@ use datafusion::{ test_util::aggr_test_schema, }; -use datafusion::datasource::physical_plan::FileScanConfigBuilder; +use datafusion::datasource::{ + physical_plan::FileScanConfigBuilder, table_schema::TableSchema, +}; use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; @@ -67,7 +69,7 @@ async fn csv_opener() -> Result<()> { let config = CsvSource::new(true, b',', b'"') .with_comment(Some(b'#')) - .with_schema(schema) + .with_schema(TableSchema::from_file_schema(schema)) .with_batch_size(8192) .with_projection(&scan_config); diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 94d651ddadd5..37b9663111a5 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -45,6 +45,7 @@ pub use datafusion_catalog::view; pub use datafusion_datasource::schema_adapter; pub use datafusion_datasource::sink; pub use datafusion_datasource::source; +pub use datafusion_datasource::table_schema; pub use datafusion_execution::object_store; pub use datafusion_physical_expr::create_ordering; diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 6df5cd7ac68f..18b855cec55e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -54,7 +54,7 @@ mod tests { use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::file::FileSource; - use datafusion_datasource::{FileRange, PartitionedFile}; + use datafusion_datasource::{FileRange, PartitionedFile, TableSchema}; use datafusion_datasource_parquet::source::ParquetSource; use 
datafusion_datasource_parquet::{ DefaultParquetFileReaderFactory, ParquetFileReaderFactory, ParquetFormat, @@ -186,7 +186,7 @@ mod tests { source = source.with_bloom_filter_on_read(false); } - source.with_schema(Arc::clone(&table_schema)) + source.with_schema(TableSchema::new(Arc::clone(&table_schema), vec![])) } fn build_parquet_exec( diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index eb4c61c02524..203d9e97d2a8 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -40,6 +40,7 @@ use crate::prelude::{Expr, SessionConfig, SessionContext}; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource::TableSchema; use object_store::path::Path; use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; @@ -186,7 +187,7 @@ impl TestParquetFile { ParquetSource::new(parquet_options) .with_predicate(Arc::clone(&physical_filter_expr)), ) - .with_schema(Arc::clone(&self.schema)); + .with_schema(TableSchema::from_file_schema(Arc::clone(&self.schema))); let config = scan_config_builder.with_source(source).build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index 54e8e7bf04da..7d8a9c7c2125 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -24,6 +24,7 @@ use datafusion_datasource::{ file_scan_config::FileScanConfigBuilder, file_stream::FileOpenFuture, file_stream::FileOpener, schema_adapter::DefaultSchemaAdapterFactory, schema_adapter::SchemaAdapterFactory, source::DataSourceExec, PartitionedFile, + TableSchema, }; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_optimizer::PhysicalOptimizerRule; @@ -156,9 +157,13 @@ impl FileSource for TestSource { }) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { + assert!( + schema.table_partition_cols().is_empty(), + "TestSource does not support partition columns" + ); Arc::new(TestSource { - schema: Some(schema), + schema: Some(schema.file_schema().clone()), ..self.clone() }) } diff --git a/datafusion/datasource-arrow/src/source.rs b/datafusion/datasource-arrow/src/source.rs index f43f11880182..f254b7e3ff30 100644 --- a/datafusion/datasource-arrow/src/source.rs +++ b/datafusion/datasource-arrow/src/source.rs @@ -20,9 +20,9 @@ use std::sync::Arc; use datafusion_datasource::as_file_source; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::TableSchema; use arrow::buffer::Buffer; -use arrow::datatypes::SchemaRef; use arrow_ipc::reader::FileDecoder; use datafusion_common::error::Result; use datafusion_common::{exec_datafusion_err, Statistics}; @@ -73,7 +73,7 @@ impl FileSource for ArrowSource { Arc::new(Self { ..self.clone() }) } - fn with_schema(&self, _schema: SchemaRef) -> Arc { + fn with_schema(&self, _schema: TableSchema) -> Arc { Arc::new(Self { ..self.clone() }) } fn with_statistics(&self, statistics: Statistics) -> Arc { diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index 0916222337b8..1ff73d2c3cc3 100644 --- a/datafusion/datasource-avro/src/source.rs +++ 
b/datafusion/datasource-avro/src/source.rs @@ -29,6 +29,7 @@ use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::TableSchema; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -84,11 +85,13 @@ impl FileSource for AvroSource { Arc::new(conf) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { let mut conf = self.clone(); - conf.schema = Some(schema); + // TableSchema may have partition columns, but AvroSource does not use partition columns or values atm + conf.schema = Some(Arc::clone(schema.file_schema())); Arc::new(conf) } + fn with_statistics(&self, statistics: Statistics) -> Arc { let mut conf = self.clone(); conf.projected_statistics = Some(statistics); diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index 0445329d0653..0b18571e58bd 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -29,7 +29,7 @@ use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; use datafusion_datasource::{ as_file_source, calculate_range, FileRange, ListingTableUrl, PartitionedFile, - RangeCalculation, + RangeCalculation, TableSchema, }; use arrow::csv; @@ -258,9 +258,9 @@ impl FileSource for CsvSource { Arc::new(conf) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { let mut conf = self.clone(); - conf.file_schema = Some(schema); + conf.file_schema = Some(Arc::clone(schema.file_schema())); Arc::new(conf) } diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs index 0b1eee1dac58..52ed0def03f1 100644 --- a/datafusion/datasource-json/src/source.rs +++ b/datafusion/datasource-json/src/source.rs @@ -32,6 +32,7 @@ use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use datafusion_datasource::{ as_file_source, calculate_range, ListingTableUrl, PartitionedFile, RangeCalculation, + TableSchema, }; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; @@ -122,7 +123,7 @@ impl FileSource for JsonSource { Arc::new(conf) } - fn with_schema(&self, _schema: SchemaRef) -> Arc { + fn with_schema(&self, _schema: TableSchema) -> Arc { Arc::new(Self { ..self.clone() }) } fn with_statistics(&self, statistics: Statistics) -> Arc { diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index b7c29f615a19..edc9c65450ec 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -35,11 +35,12 @@ use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapterFactory, }; -use arrow::datatypes::{SchemaRef, TimeUnit}; +use arrow::datatypes::TimeUnit; use datafusion_common::config::TableParquetOptions; use datafusion_common::{DataFusionError, Statistics}; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::TableSchema; use datafusion_physical_expr::conjunction; use datafusion_physical_expr_adapter::DefaultPhysicalExprAdapterFactory; 
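// Note on the hunks that follow (an illustrative summary, assuming the helpers named in
// this diff): `ParquetSource` now stores the full `TableSchema` (file schema plus partition
// columns) instead of only the file schema, and the filter-pushdown check runs against the
// *table* schema. A predicate that references a partition column, e.g. `val@0 != part@1`,
// can therefore pass the check below and be pushed into the scan, which is what the
// `parquet_filter_pushdown.slt` plan changes later in this patch show:
//
//     if can_expr_be_pushed_down_with_schemas(&filter, &table_schema) {
//         PushedDownPredicate::supported(filter)
//     } else {
//         PushedDownPredicate::unsupported(filter)
//     }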
use datafusion_physical_expr_common::physical_expr::fmt_sql; @@ -274,7 +275,7 @@ pub struct ParquetSource { /// The schema of the file. /// In particular, this is the schema of the table without partition columns, /// *not* the physical schema of the file. - pub(crate) file_schema: Option, + pub(crate) table_schema: Option, /// Optional predicate for row filtering during parquet scan pub(crate) predicate: Option>, /// Optional user defined parquet file reader factory @@ -599,9 +600,9 @@ impl FileSource for ParquetSource { Arc::new(conf) } - fn with_schema(&self, schema: SchemaRef) -> Arc { + fn with_schema(&self, schema: TableSchema) -> Arc { Arc::new(Self { - file_schema: Some(schema), + table_schema: Some(schema), ..self.clone() }) } @@ -659,9 +660,10 @@ impl FileSource for ParquetSource { // the actual predicates are built in reference to the physical schema of // each file, which we do not have at this point and hence cannot use. // Instead we use the logical schema of the file (the table schema without partition columns). - if let (Some(file_schema), Some(predicate)) = - (&self.file_schema, &self.predicate) - { + if let (Some(file_schema), Some(predicate)) = ( + &self.table_schema.as_ref().map(|ts| ts.file_schema()), + &self.predicate, + ) { let predicate_creation_errors = Count::new(); if let (Some(pruning_predicate), _) = build_pruning_predicates( Some(predicate), @@ -698,7 +700,12 @@ impl FileSource for ParquetSource { filters: Vec>, config: &ConfigOptions, ) -> datafusion_common::Result>> { - let Some(file_schema) = self.file_schema.clone() else { + let Some(table_schema) = self + .table_schema + .as_ref() + .map(|ts| ts.table_schema()) + .cloned() + else { return Ok(FilterPushdownPropagation::with_parent_pushdown_result( vec![PushedDown::No; filters.len()], )); @@ -718,7 +725,7 @@ impl FileSource for ParquetSource { let filters: Vec = filters .into_iter() .map(|filter| { - if can_expr_be_pushed_down_with_schemas(&filter, &file_schema) { + if can_expr_be_pushed_down_with_schemas(&filter, &table_schema) { PushedDownPredicate::supported(filter) } else { PushedDownPredicate::unsupported(filter) diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index 7a2cf403fd8d..d6ade3b8b210 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -26,7 +26,7 @@ use crate::file_groups::FileGroupPartitioner; use crate::file_scan_config::FileScanConfig; use crate::file_stream::FileOpener; use crate::schema_adapter::SchemaAdapterFactory; -use arrow::datatypes::SchemaRef; +use crate::TableSchema; use datafusion_common::config::ConfigOptions; use datafusion_common::{not_impl_err, Result, Statistics}; use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; @@ -64,7 +64,7 @@ pub trait FileSource: Send + Sync { /// Initialize new type with batch size configuration fn with_batch_size(&self, batch_size: usize) -> Arc; /// Initialize new instance with a new schema - fn with_schema(&self, schema: SchemaRef) -> Arc; + fn with_schema(&self, schema: TableSchema) -> Arc; /// Initialize new instance with projection information fn with_projection(&self, config: &FileScanConfig) -> Arc; /// Initialize new instance with projected statistics diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 072922eb8920..5847a8cf5e11 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -89,6 +89,7 @@ use log::{debug, warn}; /// # use 
datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; /// # use datafusion_datasource::file_stream::FileOpener; /// # use datafusion_datasource::source::DataSourceExec; +/// # use datafusion_datasource::table_schema::TableSchema; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_physical_plan::ExecutionPlan; /// # use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -109,7 +110,7 @@ use log::{debug, warn}; /// # fn create_file_opener(&self, _: Arc, _: &FileScanConfig, _: usize) -> Arc { unimplemented!() } /// # fn as_any(&self) -> &dyn Any { self } /// # fn with_batch_size(&self, _: usize) -> Arc { unimplemented!() } -/// # fn with_schema(&self, _: SchemaRef) -> Arc { Arc::new(self.clone()) as Arc } +/// # fn with_schema(&self, _: TableSchema) -> Arc { Arc::new(self.clone()) as Arc } /// # fn with_projection(&self, _: &FileScanConfig) -> Arc { unimplemented!() } /// # fn with_statistics(&self, statistics: Statistics) -> Arc { Arc::new(Self {projected_statistics: Some(statistics), schema_adapter_factory: self.schema_adapter_factory.clone()} ) } /// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() } @@ -470,7 +471,7 @@ impl FileScanConfigBuilder { let file_source = file_source .with_statistics(statistics.clone()) - .with_schema(Arc::clone(table_schema.file_schema())); + .with_schema(table_schema.clone()); let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); let new_lines_in_values = new_lines_in_values.unwrap_or(false); diff --git a/datafusion/datasource/src/table_schema.rs b/datafusion/datasource/src/table_schema.rs index 863c123e3b1d..8002df4a99df 100644 --- a/datafusion/datasource/src/table_schema.rs +++ b/datafusion/datasource/src/table_schema.rs @@ -85,6 +85,11 @@ impl TableSchema { /// The table schema is automatically computed by appending the partition columns /// to the file schema. /// + /// You should prefer calling this method over + /// chaining [`TableSchema::from_file_schema`] and [`TableSchema::with_table_partition_cols`] + /// if you have both the file schema and partition columns available at construction time + /// since it avoids re-computing the table schema. + /// /// # Arguments /// /// * `file_schema` - Schema of the data files (without partition columns) @@ -121,18 +126,21 @@ impl TableSchema { } } - /// Create a new TableSchema from a file schema with no partition columns. + /// Create a new TableSchema with no partition columns. + /// + /// You should prefer calling [`TableSchema::new`] if you have partition columns at + /// construction time since it avoids re-computing the table schema. pub fn from_file_schema(file_schema: SchemaRef) -> Self { Self::new(file_schema, vec![]) } - /// Set the table partition columns and rebuild the table schema. - pub fn with_table_partition_cols( - mut self, - table_partition_cols: Vec, - ) -> TableSchema { - self.table_partition_cols = table_partition_cols; - // Rebuild the table schema with the new partition columns + /// Add partition columns to an existing TableSchema, returning a new instance. + /// + /// You should prefer calling [`TableSchema::new`] instead of chaining [`TableSchema::from_file_schema`] + /// into [`TableSchema::with_table_partition_cols`] if you have partition columns at construction time + /// since it avoids re-computing the table schema. 
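// A short illustration of the guidance above (a sketch only; it assumes the `TableSchema`
// constructors shown in this diff, and the field names are invented for the example):
//
//     use std::sync::Arc;
//     use arrow::datatypes::{DataType, Field, Schema};
//     use datafusion_datasource::TableSchema;
//
//     let file_schema = Arc::new(Schema::new(vec![
//         Field::new("amount", DataType::Float64, false),
//     ]));
//     let part = Arc::new(Field::new("date", DataType::Utf8, false));
//
//     // Preferred: the combined table schema is computed exactly once.
//     let a = TableSchema::new(Arc::clone(&file_schema), vec![Arc::clone(&part)]);
//
//     // Same result, but the table schema is built in `from_file_schema` and then
//     // rebuilt by `with_table_partition_cols`.
//     let b = TableSchema::from_file_schema(file_schema)
//         .with_table_partition_cols(vec![part]);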
+ pub fn with_table_partition_cols(mut self, partition_cols: Vec) -> Self { + self.table_partition_cols = partition_cols; let mut builder = SchemaBuilder::from(self.file_schema.as_ref()); builder.extend(self.table_partition_cols.iter().cloned()); self.table_schema = Arc::new(builder.finish()); diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs index f0aff1fa62b7..feb704af9913 100644 --- a/datafusion/datasource/src/test_util.rs +++ b/datafusion/datasource/src/test_util.rs @@ -22,7 +22,8 @@ use crate::{ use std::sync::Arc; -use arrow::datatypes::{Schema, SchemaRef}; +use crate::TableSchema; +use arrow::datatypes::Schema; use datafusion_common::{Result, Statistics}; use datafusion_physical_expr::{expressions::Column, PhysicalExpr}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -66,7 +67,7 @@ impl FileSource for MockSource { Arc::new(Self { ..self.clone() }) } - fn with_schema(&self, _schema: SchemaRef) -> Arc { + fn with_schema(&self, _schema: TableSchema) -> Arc { Arc::new(Self { ..self.clone() }) } diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 6dc2c264aeb8..e4676ae5332d 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -474,10 +474,7 @@ EXPLAIN select * from t_pushdown where part != val logical_plan 01)Filter: t_pushdown.val != t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 != part@1 -03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet +physical_plan DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != part@1 # If we reference only a partition column it gets evaluated during the listing phase query TT @@ -505,11 +502,7 @@ EXPLAIN select * from t_pushdown where val != 'd' AND val != 'c' AND part = 'a' logical_plan 01)Filter: t_pushdown.val != Utf8View("d") AND t_pushdown.val != Utf8View("c") AND t_pushdown.val != t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val != Utf8View("d"), t_pushdown.val != Utf8View("c"), t_pushdown.val != t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 != part@1 -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c AND val@0 != part@1, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)] # The order of filters should not matter query TT @@ -518,10 +511,7 @@ EXPLAIN select val, part from t_pushdown where part = 'a' AND part = val; logical_plan 01)Filter: t_pushdown.val = t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 = part@1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 = part@1 query TT select val, part from t_pushdown where part = 'a' AND part = val; @@ -534,10 +524,7 @@ EXPLAIN select val, part from t_pushdown where part = val AND part = 'a'; logical_plan 01)Filter: t_pushdown.val = t_pushdown.part 02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 = part@1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 = part@1 query TT select val, part from t_pushdown where part = val AND part = 'a'; diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index c568b8b28e1f..f34b8b2a5cf0 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -182,6 +182,84 @@ let indices = projection_exprs.column_indices(); _execution plan_ of the query. With this release, `DESCRIBE query` now outputs the computed _schema_ of the query, consistent with the behavior of `DESCRIBE table_name`. +### Introduction of `TableSchema` and changes to `FileSource::with_schema()` method + +A new `TableSchema` struct has been introduced in the `datafusion-datasource` crate to better manage table schemas with partition columns. 
This struct helps distinguish between: + +- **File schema**: The schema of actual data files on disk +- **Partition columns**: Columns derived from directory structure (e.g., Hive-style partitioning) +- **Table schema**: The complete schema combining both file and partition columns + +As part of this change, the `FileSource::with_schema()` method signature has changed from accepting a `SchemaRef` to accepting a `TableSchema`. + +**Who is affected:** + +- Users who have implemented custom `FileSource` implementations will need to update their code +- Users who only use built-in file sources (Parquet, CSV, JSON, AVRO, Arrow) are not affected + +**Migration guide for custom `FileSource` implementations:** + +```diff + use datafusion_datasource::file::FileSource; +-use arrow::datatypes::SchemaRef; ++use datafusion_datasource::TableSchema; + + impl FileSource for MyCustomSource { +- fn with_schema(&self, schema: SchemaRef) -> Arc { ++ fn with_schema(&self, schema: TableSchema) -> Arc { + Arc::new(Self { +- schema: Some(schema), ++ // Use schema.file_schema() to get the file schema without partition columns ++ schema: Some(Arc::clone(schema.file_schema())), + ..self.clone() + }) + } + } +``` + +For implementations that need access to partition columns: + +```rust,ignore +fn with_schema(&self, schema: TableSchema) -> Arc { + Arc::new(Self { + file_schema: Arc::clone(schema.file_schema()), + partition_cols: schema.table_partition_cols().clone(), + table_schema: Arc::clone(schema.table_schema()), + ..self.clone() + }) +} +``` + +**Note**: Most `FileSource` implementations only need to store the file schema (without partition columns), as shown in the first example. The second pattern of storing all three schema components is typically only needed for advanced use cases where you need access to different schema representations for different operations (e.g., ParquetSource uses the file schema for building pruning predicates but needs the table schema for filter pushdown logic). + +**Using `TableSchema` directly:** + +If you're constructing a `FileScanConfig` or working with table schemas and partition columns, you can now use `TableSchema`: + +```rust +use datafusion_datasource::TableSchema; +use arrow::datatypes::{Schema, Field, DataType}; +use std::sync::Arc; + +// Create a TableSchema with partition columns +let file_schema = Arc::new(Schema::new(vec![ + Field::new("user_id", DataType::Int64, false), + Field::new("amount", DataType::Float64, false), +])); + +let partition_cols = vec![ + Arc::new(Field::new("date", DataType::Utf8, false)), + Arc::new(Field::new("region", DataType::Utf8, false)), +]; + +let table_schema = TableSchema::new(file_schema, partition_cols); + +// Access different schema representations +let file_schema_ref = table_schema.file_schema(); // Schema without partition columns +let full_schema = table_schema.table_schema(); // Complete schema with partition columns +let partition_cols_ref = table_schema.table_partition_cols(); // Just the partition columns +``` + ## DataFusion `50.0.0` ### ListingTable automatically detects Hive Partitioned tables From 68c74d363cb84cebd8a42dca004d2d435b511ae8 Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Thu, 30 Oct 2025 03:27:25 +0800 Subject: [PATCH 046/157] chore: Format examples in doc strings - macros and optmizer (#18354) ## Which issue does this PR close? 
Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p -- --config format_code_in_doc_comments=true` for the following datasource-related crates: - `datafusion-macros` - `datafusion-optimizer` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. --- datafusion/macros/src/user_doc.rs | 1 - datafusion/optimizer/src/push_down_filter.rs | 4 - datafusion/optimizer/src/push_down_limit.rs | 1 - .../simplify_expressions/expr_simplifier.rs | 110 +++++++++--------- .../src/simplify_expressions/unwrap_cast.rs | 1 - 5 files changed, 54 insertions(+), 63 deletions(-) diff --git a/datafusion/macros/src/user_doc.rs b/datafusion/macros/src/user_doc.rs index 71ce381ec431..58c2cc2b1b2a 100644 --- a/datafusion/macros/src/user_doc.rs +++ b/datafusion/macros/src/user_doc.rs @@ -61,7 +61,6 @@ use syn::{parse_macro_input, DeriveInput, LitStr}; /// } /// ``` /// will generate the following code -/// /// ```ignore /// pub struct ToDateFunc { /// signature: Signature, diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index a8251d669002..1c0790b3e3ac 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -562,7 +562,6 @@ fn push_down_join( /// /// * `on_filters` filters from the join ON clause that have not already been /// identified as join predicates -/// fn infer_join_predicates( join: &Join, predicates: &[Expr], @@ -649,7 +648,6 @@ impl InferredPredicates { /// * `predicates` the pushed down predicates /// /// * `inferred_predicates` the inferred results -/// fn infer_join_predicates_from_predicates( join_col_keys: &[(&Column, &Column)], predicates: &[Expr], @@ -673,7 +671,6 @@ fn infer_join_predicates_from_predicates( /// identified as join predicates /// /// * `inferred_predicates` the inferred results -/// fn infer_join_predicates_from_on_filters( join_col_keys: &[(&Column, &Column)], join_type: JoinType, @@ -719,7 +716,6 @@ fn infer_join_predicates_from_on_filters( /// /// * `ENABLE_RIGHT_TO_LEFT` indicates that the left table related predicate can /// be inferred from the right table related predicate -/// fn infer_join_predicates_impl< const ENABLE_LEFT_TO_RIGHT: bool, const ENABLE_RIGHT_TO_LEFT: bool, diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index c5a2e6578805..80d4a2de6679 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -30,7 +30,6 @@ use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; use datafusion_expr::{lit, FetchType, SkipType}; /// Optimization rule that tries to push down `LIMIT`. -/// //. 
It will push down through projection, limits (taking the smaller limit) #[derive(Default, Debug)] pub struct PushDownLimit {} diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 204ce14e37d8..85e9d9b6a0ed 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -69,23 +69,21 @@ use regex::Regex; /// /// For example: /// ``` -/// use arrow::datatypes::{Schema, Field, DataType}; -/// use datafusion_expr::{col, lit}; +/// use arrow::datatypes::{DataType, Field, Schema}; /// use datafusion_common::{DataFusionError, ToDFSchema}; /// use datafusion_expr::execution_props::ExecutionProps; /// use datafusion_expr::simplify::SimplifyContext; +/// use datafusion_expr::{col, lit}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; /// /// // Create the schema -/// let schema = Schema::new(vec![ -/// Field::new("i", DataType::Int64, false), -/// ]) -/// .to_dfschema_ref().unwrap(); +/// let schema = Schema::new(vec![Field::new("i", DataType::Int64, false)]) +/// .to_dfschema_ref() +/// .unwrap(); /// /// // Create the simplifier /// let props = ExecutionProps::new(); -/// let context = SimplifyContext::new(&props) -/// .with_schema(schema); +/// let context = SimplifyContext::new(&props).with_schema(schema); /// let simplifier = ExprSimplifier::new(context); /// /// // Use the simplifier @@ -144,35 +142,35 @@ impl ExprSimplifier { /// /// ``` /// use arrow::datatypes::DataType; - /// use datafusion_expr::{col, lit, Expr}; + /// use datafusion_common::DFSchema; /// use datafusion_common::Result; /// use datafusion_expr::execution_props::ExecutionProps; /// use datafusion_expr::simplify::SimplifyContext; /// use datafusion_expr::simplify::SimplifyInfo; + /// use datafusion_expr::{col, lit, Expr}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; - /// use datafusion_common::DFSchema; /// use std::sync::Arc; /// /// /// Simple implementation that provides `Simplifier` the information it needs /// /// See SimplifyContext for a structure that does this. /// #[derive(Default)] /// struct Info { - /// execution_props: ExecutionProps, + /// execution_props: ExecutionProps, /// }; /// /// impl SimplifyInfo for Info { - /// fn is_boolean_type(&self, expr: &Expr) -> Result { - /// Ok(false) - /// } - /// fn nullable(&self, expr: &Expr) -> Result { - /// Ok(true) - /// } - /// fn execution_props(&self) -> &ExecutionProps { - /// &self.execution_props - /// } - /// fn get_data_type(&self, expr: &Expr) -> Result { - /// Ok(DataType::Int32) - /// } + /// fn is_boolean_type(&self, expr: &Expr) -> Result { + /// Ok(false) + /// } + /// fn nullable(&self, expr: &Expr) -> Result { + /// Ok(true) + /// } + /// fn execution_props(&self) -> &ExecutionProps { + /// &self.execution_props + /// } + /// fn get_data_type(&self, expr: &Expr) -> Result { + /// Ok(DataType::Int32) + /// } /// } /// /// // Create the simplifier @@ -198,7 +196,6 @@ impl ExprSimplifier { /// optimizations. /// /// See [Self::simplify] for details and usage examples. - /// #[deprecated( since = "48.0.0", note = "Use `simplify_with_cycle_count_transformed` instead" @@ -222,7 +219,6 @@ impl ExprSimplifier { /// - The number of simplification cycles that were performed /// /// See [Self::simplify] for details and usage examples. 
- /// pub fn simplify_with_cycle_count_transformed( &self, mut expr: Expr, @@ -286,24 +282,24 @@ impl ExprSimplifier { /// /// ```rust /// use arrow::datatypes::{DataType, Field, Schema}; - /// use datafusion_expr::{col, lit, Expr}; - /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_common::{Result, ScalarValue, ToDFSchema}; /// use datafusion_expr::execution_props::ExecutionProps; + /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_expr::simplify::SimplifyContext; + /// use datafusion_expr::{col, lit, Expr}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; /// /// let schema = Schema::new(vec![ - /// Field::new("x", DataType::Int64, false), - /// Field::new("y", DataType::UInt32, false), - /// Field::new("z", DataType::Int64, false), - /// ]) - /// .to_dfschema_ref().unwrap(); + /// Field::new("x", DataType::Int64, false), + /// Field::new("y", DataType::UInt32, false), + /// Field::new("z", DataType::Int64, false), + /// ]) + /// .to_dfschema_ref() + /// .unwrap(); /// /// // Create the simplifier /// let props = ExecutionProps::new(); - /// let context = SimplifyContext::new(&props) - /// .with_schema(schema); + /// let context = SimplifyContext::new(&props).with_schema(schema); /// /// // Expression: (x >= 3) AND (y + 2 < 10) AND (z > 5) /// let expr_x = col("x").gt_eq(lit(3_i64)); @@ -312,15 +308,18 @@ impl ExprSimplifier { /// let expr = expr_x.and(expr_y).and(expr_z.clone()); /// /// let guarantees = vec![ - /// // x ∈ [3, 5] - /// ( - /// col("x"), - /// NullableInterval::NotNull { - /// values: Interval::make(Some(3_i64), Some(5_i64)).unwrap() - /// } - /// ), - /// // y = 3 - /// (col("y"), NullableInterval::from(ScalarValue::UInt32(Some(3)))), + /// // x ∈ [3, 5] + /// ( + /// col("x"), + /// NullableInterval::NotNull { + /// values: Interval::make(Some(3_i64), Some(5_i64)).unwrap(), + /// }, + /// ), + /// // y = 3 + /// ( + /// col("y"), + /// NullableInterval::from(ScalarValue::UInt32(Some(3))), + /// ), /// ]; /// let simplifier = ExprSimplifier::new(context).with_guarantees(guarantees); /// let output = simplifier.simplify(expr).unwrap(); @@ -345,24 +344,24 @@ impl ExprSimplifier { /// /// ```rust /// use arrow::datatypes::{DataType, Field, Schema}; - /// use datafusion_expr::{col, lit, Expr}; - /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_common::{Result, ScalarValue, ToDFSchema}; /// use datafusion_expr::execution_props::ExecutionProps; + /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; /// use datafusion_expr::simplify::SimplifyContext; + /// use datafusion_expr::{col, lit, Expr}; /// use datafusion_optimizer::simplify_expressions::ExprSimplifier; /// /// let schema = Schema::new(vec![ - /// Field::new("a", DataType::Int64, false), - /// Field::new("b", DataType::Int64, false), - /// Field::new("c", DataType::Int64, false), - /// ]) - /// .to_dfschema_ref().unwrap(); + /// Field::new("a", DataType::Int64, false), + /// Field::new("b", DataType::Int64, false), + /// Field::new("c", DataType::Int64, false), + /// ]) + /// .to_dfschema_ref() + /// .unwrap(); /// /// // Create the simplifier /// let props = ExecutionProps::new(); - /// let context = SimplifyContext::new(&props) - /// .with_schema(schema); + /// let context = SimplifyContext::new(&props).with_schema(schema); /// let simplifier = ExprSimplifier::new(context); /// /// // Expression: a = c AND 1 = b @@ -376,9 +375,9 @@ impl 
ExprSimplifier { /// /// // If canonicalization is disabled, the expression is not changed /// let non_canonicalized = simplifier - /// .with_canonicalize(false) - /// .simplify(expr.clone()) - /// .unwrap(); + /// .with_canonicalize(false) + /// .simplify(expr.clone()) + /// .unwrap(); /// /// assert_eq!(non_canonicalized, expr); /// ``` @@ -437,7 +436,6 @@ impl ExprSimplifier { /// assert_eq!(simplified_expr.data, lit(true)); /// // Only 1 cycle was executed /// assert_eq!(count, 1); - /// /// ``` pub fn with_max_cycles(mut self, max_simplifier_cycles: u32) -> Self { self.max_simplifier_cycles = max_simplifier_cycles; diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs index 5286cbd7bdf6..b1f3b006e0cf 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs @@ -53,7 +53,6 @@ //! ```text //! c1 > INT32(10) //! ``` -//! use arrow::datatypes::DataType; use datafusion_common::{internal_err, tree_node::Transformed}; From bffabc7179a08966a0401415557599e7d5106389 Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Thu, 30 Oct 2025 03:27:49 +0800 Subject: [PATCH 047/157] chore: Format examples in doc strings - proto, pruning, and session (#18358) ## Which issue does this PR close? Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p -- --config format_code_in_doc_comments=true` for the following datasource-related crates: - `datafusion-proto` - `datafusion-proto-common` - `datafusion-pruning` - `datafusion-session` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. --- datafusion/proto-common/src/lib.rs | 43 ++++++++++++--------- datafusion/proto/src/lib.rs | 14 +++---- datafusion/pruning/src/pruning_predicate.rs | 3 +- datafusion/session/src/session.rs | 9 +++-- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/datafusion/proto-common/src/lib.rs b/datafusion/proto-common/src/lib.rs index 9efb234e3994..b0061168c5ce 100644 --- a/datafusion/proto-common/src/lib.rs +++ b/datafusion/proto-common/src/lib.rs @@ -62,28 +62,33 @@ //! # use datafusion_proto_common::protobuf_common; //! # use prost::Message; //! # fn main() -> Result<()>{ -//! // Create a new ScalarValue -//! let val = ScalarValue::UInt64(Some(3)); -//! let mut buffer = BytesMut::new(); -//! let protobuf: protobuf_common::ScalarValue = match val { -//! ScalarValue::UInt64(Some(val)) => { -//! protobuf_common::ScalarValue{value: Some(protobuf_common::scalar_value::Value::Uint64Value(val))} -//! } -//! _ => unreachable!(), -//! }; +//! // Create a new ScalarValue +//! let val = ScalarValue::UInt64(Some(3)); +//! let mut buffer = BytesMut::new(); +//! let protobuf: protobuf_common::ScalarValue = match val { +//! ScalarValue::UInt64(Some(val)) => protobuf_common::ScalarValue { +//! value: Some(protobuf_common::scalar_value::Value::Uint64Value(val)), +//! }, +//! _ => unreachable!(), +//! }; //! -//! protobuf.encode(&mut buffer) +//! protobuf +//! 
.encode(&mut buffer) //! .map_err(|e| plan_datafusion_err!("Error encoding protobuf as bytes: {e}"))?; -//! // Convert it to bytes (for sending over the network, etc.) -//! let bytes: Bytes = buffer.into(); +//! // Convert it to bytes (for sending over the network, etc.) +//! let bytes: Bytes = buffer.into(); //! -//! let protobuf = protobuf_common::ScalarValue::decode(bytes).map_err(|e| plan_datafusion_err!("Error decoding ScalarValue as protobuf: {e}"))?; -//! // Decode bytes from somewhere (over network, etc.) back to ScalarValue -//! let decoded_val: ScalarValue = match protobuf.value { -//! Some(protobuf_common::scalar_value::Value::Uint64Value(val)) => ScalarValue::UInt64(Some(val)), -//! _ => unreachable!(), -//! }; -//! assert_eq!(val, decoded_val); +//! let protobuf = protobuf_common::ScalarValue::decode(bytes).map_err(|e| { +//! plan_datafusion_err!("Error decoding ScalarValue as protobuf: {e}") +//! })?; +//! // Decode bytes from somewhere (over network, etc.) back to ScalarValue +//! let decoded_val: ScalarValue = match protobuf.value { +//! Some(protobuf_common::scalar_value::Value::Uint64Value(val)) => { +//! ScalarValue::UInt64(Some(val)) +//! } +//! _ => unreachable!(), +//! }; +//! assert_eq!(val, decoded_val); //! # Ok(()) //! # } //! ``` diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index b1590b9ad2aa..b16b12bc0516 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -64,15 +64,15 @@ //! # use datafusion_expr::{col, lit, Expr}; //! # use datafusion_proto::bytes::Serializeable; //! # fn main() -> Result<()>{ -//! // Create a new `Expr` a < 32 -//! let expr = col("a").lt(lit(5i32)); +//! // Create a new `Expr` a < 32 +//! let expr = col("a").lt(lit(5i32)); //! -//! // Convert it to bytes (for sending over the network, etc.) -//! let bytes = expr.to_bytes()?; +//! // Convert it to bytes (for sending over the network, etc.) +//! let bytes = expr.to_bytes()?; //! -//! // Decode bytes from somewhere (over network, etc.) back to Expr -//! let decoded_expr = Expr::from_bytes(&bytes)?; -//! assert_eq!(expr, decoded_expr); +//! // Decode bytes from somewhere (over network, etc.) back to Expr +//! let decoded_expr = Expr::from_bytes(&bytes)?; +//! assert_eq!(expr, decoded_expr); //! # Ok(()) //! # } //! ``` diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index fa3454ce5644..380ada10df6e 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -882,7 +882,7 @@ impl From> for RequiredColumns { /// ```text /// ("s1", Min, Field:s1_min) /// ("s2", Max, field:s2_max) -///``` +/// ``` /// /// And the input statistics had /// ```text @@ -5108,7 +5108,6 @@ mod tests { /// /// `expected` is a vector of bools, where true means the row group should /// be kept, and false means it should be pruned. 
- /// // TODO refactor other tests to use this to reduce boiler plate fn prune_with_expr( expr: Expr, diff --git a/datafusion/session/src/session.rs b/datafusion/session/src/session.rs index de23dba491fd..fd033172f224 100644 --- a/datafusion/session/src/session.rs +++ b/datafusion/session/src/session.rs @@ -57,9 +57,12 @@ use std::sync::{Arc, Weak}; /// // Given a `Session` reference, get the concrete `SessionState` reference /// // Note: this may stop working in future versions, /// fn session_state_from_session(session: &dyn Session) -> Result<&SessionState> { -/// session.as_any() -/// .downcast_ref::() -/// .ok_or_else(|| exec_datafusion_err!("Failed to downcast Session to SessionState")) +/// session +/// .as_any() +/// .downcast_ref::() +/// .ok_or_else(|| { +/// exec_datafusion_err!("Failed to downcast Session to SessionState") +/// }) /// } /// ``` /// From 618e49695f745dcbcdd157c39a9381e5c6f9fed2 Mon Sep 17 00:00:00 2001 From: Yu-Chuan Hung <86523891+CuteChuanChuan@users.noreply.github.com> Date: Thu, 30 Oct 2025 03:59:55 +0800 Subject: [PATCH 048/157] chore: Format examples in doc strings - catalog listing (#18335) ## Which issue does this PR close? Part of #16915 ## Rationale for this change Format code examples in documentation comments to improve readability and maintain consistent code style across the codebase. This is part of a multi-PR effort to format all doc comment examples and eventually enable CI checks to enforce this formatting. ## What changes are included in this PR? Run `cargo fmt -p datafusion-catalog-listing -- --config format_code_in_doc_comments=true` ## Are these changes tested? No testing needed - this is purely a formatting change with no functional modifications. ## Are there any user-facing changes? No - this only affects documentation formatting. Co-authored-by: Andrew Lamb --- datafusion/catalog-listing/src/config.rs | 5 ++-- datafusion/catalog-listing/src/options.rs | 36 ++++++++--------------- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/datafusion/catalog-listing/src/config.rs b/datafusion/catalog-listing/src/config.rs index 90f44de4fdbc..3370d2ea7553 100644 --- a/datafusion/catalog-listing/src/config.rs +++ b/datafusion/catalog-listing/src/config.rs @@ -53,7 +53,6 @@ pub enum SchemaSource { /// /// If not specified, a [`datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory`] /// will be used, which handles basic schema compatibility cases. -/// #[derive(Debug, Clone, Default)] pub struct ListingTableConfig { /// Paths on the `ObjectStore` for creating [`crate::ListingTable`]. 
@@ -160,8 +159,8 @@ impl ListingTableConfig { /// .with_file_extension(".parquet") /// .with_collect_stat(true); /// - /// let config = ListingTableConfig::new(table_paths) - /// .with_listing_options(options); // Configure file format and options + /// let config = ListingTableConfig::new(table_paths).with_listing_options(options); + /// // Configure file format and options /// ``` pub fn with_listing_options(self, listing_options: ListingOptions) -> Self { // Note: This method properly sets options, but be aware that downstream diff --git a/datafusion/catalog-listing/src/options.rs b/datafusion/catalog-listing/src/options.rs index 3cbf3573e951..7da8005f90ec 100644 --- a/datafusion/catalog-listing/src/options.rs +++ b/datafusion/catalog-listing/src/options.rs @@ -100,10 +100,8 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_extension(".parquet"); + /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())) + /// .with_file_extension(".parquet"); /// /// assert_eq!(listing_options.file_extension, ".parquet"); /// ``` @@ -123,10 +121,8 @@ impl ListingOptions { /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// /// let extension = Some(".parquet"); - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_extension_opt(extension); + /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())) + /// .with_file_extension_opt(extension); /// /// assert_eq!(listing_options.file_extension, ".parquet"); /// ``` @@ -216,10 +212,8 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_collect_stat(true); + /// let listing_options = + /// ListingOptions::new(Arc::new(ParquetFormat::default())).with_collect_stat(true); /// /// assert_eq!(listing_options.collect_stat, true); /// ``` @@ -235,10 +229,8 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_target_partitions(8); + /// let listing_options = + /// ListingOptions::new(Arc::new(ParquetFormat::default())).with_target_partitions(8); /// /// assert_eq!(listing_options.target_partitions, 8); /// ``` @@ -255,15 +247,11 @@ impl ListingOptions { /// # use datafusion_catalog_listing::ListingOptions; /// # use datafusion_datasource_parquet::file_format::ParquetFormat; /// - /// // Tell datafusion that the files are sorted by column "a" - /// let file_sort_order = vec![vec![ - /// col("a").sort(true, true) - /// ]]; + /// // Tell datafusion that the files are sorted by column "a" + /// let file_sort_order = vec![vec![col("a").sort(true, true)]]; /// - /// let listing_options = ListingOptions::new(Arc::new( - /// ParquetFormat::default() - /// )) - /// .with_file_sort_order(file_sort_order.clone()); + /// let listing_options = ListingOptions::new(Arc::new(ParquetFormat::default())) + /// .with_file_sort_order(file_sort_order.clone()); /// /// assert_eq!(listing_options.file_sort_order, file_sort_order); /// ``` 
From 0ca4eafe10c846a1bf27492bb0c2972c765be9a0 Mon Sep 17 00:00:00 2001 From: r1b Date: Wed, 29 Oct 2025 16:41:39 -0400 Subject: [PATCH 049/157] feat: support temporary views in DataFrameTableProvider (#18158) ## Which issue does this PR close? - Closes #18026 ## Rationale for this change This makes it possible to support temporary views in datafusion-python without code duplication. Ref: https://github.com/apache/datafusion-python/pull/1267 ## What changes are included in this PR? - Add new public function `DataFrame::into_temporary_view` - Update `DataFrameTableProvider` with a new member that determines the `table_type` - Add a test ## Are these changes tested? Yes, see added test `register_temporary_table` ## Are there any user-facing changes? Yes, there is a new public function `DataFrame::into_temporary_view` --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe/mod.rs | 17 +++++++++++++++-- datafusion/core/tests/dataframe/mod.rs | 19 ++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 3186c5cb8230..b164b050da80 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1654,7 +1654,19 @@ impl DataFrame { /// Note: This discards the [`SessionState`] associated with this /// [`DataFrame`] in favour of the one passed to [`TableProvider::scan`] pub fn into_view(self) -> Arc { - Arc::new(DataFrameTableProvider { plan: self.plan }) + Arc::new(DataFrameTableProvider { + plan: self.plan, + table_type: TableType::View, + }) + } + + /// See [`Self::into_view`]. The returned [`TableProvider`] will + /// create a transient table. + pub fn into_temporary_view(self) -> Arc { + Arc::new(DataFrameTableProvider { + plan: self.plan, + table_type: TableType::Temporary, + }) + } /// Return a DataFrame with the explanation of its plan so far. @@ -2524,6 +2536,7 @@ macro_rules! 
dataframe { #[derive(Debug)] struct DataFrameTableProvider { plan: LogicalPlan, + table_type: TableType, } #[async_trait] @@ -2549,7 +2562,7 @@ impl TableProvider for DataFrameTableProvider { } fn table_type(&self) -> TableType { - TableType::View + self.table_type } async fn scan( diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 043f42b18c9f..e27a3414850a 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -77,7 +77,7 @@ use datafusion_expr::var_provider::{VarProvider, VarType}; use datafusion_expr::{ cast, col, create_udf, exists, in_subquery, lit, out_ref_col, placeholder, scalar_subquery, when, wildcard, Expr, ExprFunctionExt, ExprSchemable, LogicalPlan, - LogicalPlanBuilder, ScalarFunctionImplementation, SortExpr, WindowFrame, + LogicalPlanBuilder, ScalarFunctionImplementation, SortExpr, TableType, WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, }; use datafusion_physical_expr::aggregate::AggregateExprBuilder; @@ -1577,6 +1577,23 @@ async fn register_table() -> Result<()> { Ok(()) } +#[tokio::test] +async fn register_temporary_table() -> Result<()> { + let df = test_table().await?.select_columns(&["c1", "c12"])?; + let ctx = SessionContext::new(); + let df_impl = DataFrame::new(ctx.state(), df.logical_plan().clone()); + + let df_table_provider = df_impl.clone().into_temporary_view(); + + // check that we set the correct table_type + assert_eq!(df_table_provider.table_type(), TableType::Temporary); + + // check that we can register a dataframe as a temporary table + ctx.register_table("test_table", df_table_provider)?; + + Ok(()) +} + /// Compare the formatted string representation of two plans for equality fn assert_same_plan(plan1: &LogicalPlan, plan2: &LogicalPlan) { assert_eq!(format!("{plan1:?}"), format!("{plan2:?}")); From d21279d1bf5f450910ac7444deec0748b8c622e1 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Thu, 30 Oct 2025 13:55:21 +0800 Subject: [PATCH 050/157] feat: Better parquet row-group/page pruning metrics display (#18321) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes https://github.com/apache/datafusion/issues/18299 ## Rationale for this change See writeup in https://github.com/apache/datafusion/pull/18297 This PR is for the remaining metrics in `DataSourceExec` with parquet data source. 
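As background, a `PruningMetrics` value records both how many items a filter examined and how many survived, and `EXPLAIN ANALYZE` renders the pair as `<total> total → <matched> matched`. The following minimal sketch is illustrative only (it is not part of this diff) and assumes that `ExecutionPlanMetricsSet`, `MetricBuilder::pruning_metrics`, and the `add_pruned`/`add_matched`/`pruned`/`matched` methods used elsewhere in this patch are exposed from `datafusion_physical_plan::metrics`; the function name is hypothetical:

```rust
use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder};

fn pruning_metrics_sketch() {
    let metrics = ExecutionPlanMetricsSet::new();

    // Register a pruning-style metric for partition 0 (hypothetical usage).
    let row_groups = MetricBuilder::new(&metrics)
        .pruning_metrics("row_groups_pruned_statistics", 0);

    // Suppose statistics pruning examined 4 row groups: 1 was skipped, 3 were kept.
    row_groups.add_pruned(1);
    row_groups.add_matched(3);

    assert_eq!(row_groups.pruned(), 1);
    assert_eq!(row_groups.matched(), 3);

    // EXPLAIN ANALYZE displays this pair roughly as:
    //   row_groups_pruned_statistics=4 total → 3 matched
}
```

The same pattern replaces the previous separate `*_matched_*` / `*_pruned_*` counters, so a single metric name now carries both sides of each pruning decision.
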
### Demo In datafusion-cli ``` CREATE EXTERNAL TABLE IF NOT EXISTS lineitem STORED AS parquet LOCATION '/Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem'; set datafusion.explain.analyze_level = summary; explain analyze select * from lineitem where l_orderkey = 3000000; +-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | plan_type | plan | +-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Plan with Metrics | CoalesceBatchesExec: target_batch_size=8192, metrics=[output_rows=5, elapsed_compute=48.677µs, output_bytes=1092.0 B] | | | FilterExec: l_orderkey@0 = 
3000000, metrics=[output_rows=5, elapsed_compute=1.65872ms, output_bytes=530.8 KB] | | | DataSourceExec: file_groups={14 groups: [[Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-0.parquet:0..11525426], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-0.parquet:11525426..20311205, Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:0..2739647], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:2739647..14265073], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-1.parquet:14265073..20193593, Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-2.parquet:0..5596906], [Users/yongting/Code/datafusion/benchmarks/data/tpch_sf1/lineitem/part-2.parquet:5596906..17122332], ...]}, projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment], file_type=parquet, predicate=l_orderkey@0 = 3000000, pruning_predicate=l_orderkey_null_count@2 != row_count@3 AND l_orderkey_min@0 <= 3000000 AND 3000000 <= l_orderkey_max@1, required_guarantees=[l_orderkey in (3000000)], metrics=[output_rows=19813, elapsed_compute=14ns, output_bytes=5.7 MB, files_ranges_pruned_statistics=21 total → 3 matched, page_index_rows_pruned=748901 total → 19813 matched, row_groups_pruned_bloom_filter=1 total → 1 matched, row_groups_pruned_statistics=1 total → 1 matched, bytes_scanned=2147308, metadata_load_time=1.794289ms] | | | | +-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 1 row(s) fetched. Elapsed 0.081 seconds. ``` ## What changes are included in this PR? Update `row_groups_pruned_statistics`, `row_groups_pruned_bloom_filter`, `page_index_rows_pruned` with the new `PruningMetrics` metric type. The functional changes in the pr are in `datafusion/datasource-parquet/src/*`, it's only a few of lines, most changes are fixing tests. ## Are these changes tested? 
UTs are updated for the new metrics ## Are there any user-facing changes? No --------- Co-authored-by: Andrew Lamb --- .../examples/json_shredding.rs | 2 +- .../src/datasource/physical_plan/parquet.rs | 47 ++++++-- .../tests/parquet/external_access_plan.rs | 23 ++-- .../core/tests/parquet/filter_pushdown.rs | 31 ++++-- datafusion/core/tests/parquet/mod.rs | 90 +++++++++------- .../core/tests/parquet/row_group_pruning.rs | 102 +++++++++--------- datafusion/core/tests/sql/explain_analyze.rs | 14 +-- datafusion/datasource-parquet/src/metrics.rs | 41 ++----- datafusion/datasource-parquet/src/opener.rs | 22 ++++ .../datasource-parquet/src/page_filter.rs | 6 +- .../src/row_group_filter.rs | 27 ++++- datafusion/physical-plan/src/metrics/mod.rs | 2 +- docs/source/user-guide/explain-usage.md | 13 +-- 13 files changed, 254 insertions(+), 166 deletions(-) diff --git a/datafusion-examples/examples/json_shredding.rs b/datafusion-examples/examples/json_shredding.rs index a2e83bc9510a..5ef8b59b6420 100644 --- a/datafusion-examples/examples/json_shredding.rs +++ b/datafusion-examples/examples/json_shredding.rs @@ -142,7 +142,7 @@ async fn main() -> Result<()> { .await?; let plan = format!("{}", arrow::util::pretty::pretty_format_batches(&batches)?); println!("{plan}"); - assert_contains!(&plan, "row_groups_pruned_statistics=1"); + assert_contains!(&plan, "row_groups_pruned_statistics=2 total → 1 matched"); assert_contains!(&plan, "pushdown_rows_pruned=1"); Ok(()) diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 18b855cec55e..0ffb252a6605 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -65,7 +65,7 @@ mod tests { use datafusion_physical_plan::analyze::AnalyzeExec; use datafusion_physical_plan::collect; use datafusion_physical_plan::metrics::{ - ExecutionPlanMetricsSet, MetricType, MetricsSet, + ExecutionPlanMetricsSet, MetricType, MetricValue, MetricsSet, }; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; @@ -1175,8 +1175,10 @@ mod tests { // There are 4 rows pruned in each of batch2, batch3, and // batch4 for a total of 12. 
batch1 had no pruning as c2 was // filled in as null - assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 12); - assert_eq!(get_value(&metrics, "page_index_rows_matched"), 6); + let (page_index_pruned, page_index_matched) = + get_pruning_metric(&metrics, "page_index_rows_pruned"); + assert_eq!(page_index_pruned, 12); + assert_eq!(page_index_matched, 6); } #[tokio::test] @@ -1776,8 +1778,10 @@ mod tests { | 5 | +-----+ "###); - assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 4); - assert_eq!(get_value(&metrics, "page_index_rows_matched"), 2); + let (page_index_pruned, page_index_matched) = + get_pruning_metric(&metrics, "page_index_rows_pruned"); + assert_eq!(page_index_pruned, 4); + assert_eq!(page_index_matched, 2); assert!( get_value(&metrics, "page_index_eval_time") > 0, "no eval time in metrics: {metrics:#?}" @@ -1866,8 +1870,10 @@ mod tests { assert_contains!(&explain, "predicate=c1@0 != bar"); // there's a single row group, but we can check that it matched - // if no pruning was done this would be 0 instead of 1 - assert_contains!(&explain, "row_groups_matched_statistics=1"); + assert_contains!( + &explain, + "row_groups_pruned_statistics=1 total \u{2192} 1 matched" + ); // check the projection assert_contains!(&explain, "projection=[c1]"); @@ -1898,8 +1904,10 @@ mod tests { // When both matched and pruned are 0, it means that the pruning predicate // was not used at all. - assert_contains!(&explain, "row_groups_matched_statistics=0"); - assert_contains!(&explain, "row_groups_pruned_statistics=0"); + assert_contains!( + &explain, + "row_groups_pruned_statistics=1 total \u{2192} 1 matched" + ); // But pushdown predicate should be present assert_contains!( @@ -1952,7 +1960,12 @@ mod tests { /// Panics if no such metric. fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize { match metrics.sum_by_name(metric_name) { - Some(v) => v.as_usize(), + Some(v) => match v { + MetricValue::PruningMetrics { + pruning_metrics, .. + } => pruning_metrics.pruned(), + _ => v.as_usize(), + }, _ => { panic!( "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" @@ -1961,6 +1974,20 @@ mod tests { } } + fn get_pruning_metric(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) { + match metrics.sum_by_name(metric_name) { + Some(MetricValue::PruningMetrics { + pruning_metrics, .. + }) => (pruning_metrics.pruned(), pruning_metrics.matched()), + Some(_) => panic!( + "Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}" + ), + None => panic!( + "Expected metric not found. 
Looking for '{metric_name}' in\n\n{metrics:#?}" + ), + } + } + fn populate_csv_partitions( tmp_dir: &TempDir, partition_count: usize, diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index a5397c5a397c..5135f956852c 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -33,7 +33,7 @@ use datafusion_common::{assert_contains, DFSchema}; use datafusion_datasource_parquet::{ParquetAccessPlan, RowGroupAccess}; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_expr::{col, lit, Expr}; -use datafusion_physical_plan::metrics::MetricsSet; +use datafusion_physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion_physical_plan::ExecutionPlan; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; @@ -178,12 +178,21 @@ async fn plan_and_filter() { .unwrap(); // Verify that row group pruning still happens for just that group - let row_groups_pruned_statistics = - metric_value(&parquet_metrics, "row_groups_pruned_statistics").unwrap(); - assert_eq!( - row_groups_pruned_statistics, 1, - "metrics : {parquet_metrics:#?}", - ); + let row_groups_pruned_statistics = parquet_metrics + .sum_by_name("row_groups_pruned_statistics") + .unwrap(); + if let MetricValue::PruningMetrics { + pruning_metrics, .. + } = row_groups_pruned_statistics + { + assert_eq!( + pruning_metrics.pruned(), + 1, + "metrics : {parquet_metrics:#?}", + ); + } else { + unreachable!("metrics `row_groups_pruned_statistics` should exist") + } } #[tokio::test] diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index 226497fe5824..966f25161397 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -29,7 +29,7 @@ use arrow::compute::concat_batches; use arrow::record_batch::RecordBatch; use datafusion::physical_plan::collect; -use datafusion::physical_plan::metrics::MetricsSet; +use datafusion::physical_plan::metrics::{MetricValue, MetricsSet}; use datafusion::prelude::{ col, lit, lit_timestamp_nano, Expr, ParquetReadOptions, SessionContext, }; @@ -563,9 +563,9 @@ impl<'a> TestCase<'a> { } }; - let page_index_rows_pruned = get_value(&metrics, "page_index_rows_pruned"); + let (page_index_rows_pruned, page_index_rows_matched) = + get_pruning_metrics(&metrics, "page_index_rows_pruned"); println!(" page_index_rows_pruned: {page_index_rows_pruned}"); - let page_index_rows_matched = get_value(&metrics, "page_index_rows_matched"); println!(" page_index_rows_matched: {page_index_rows_matched}"); let page_index_filtering_expected = if scan_options.enable_page_index { @@ -592,14 +592,29 @@ impl<'a> TestCase<'a> { } } +fn get_pruning_metrics(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) { + match metrics.sum_by_name(metric_name) { + Some(MetricValue::PruningMetrics { + pruning_metrics, .. + }) => (pruning_metrics.pruned(), pruning_metrics.matched()), + Some(_) => { + panic!("Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}") + } + None => panic!( + "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" + ), + } +} + fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize { match metrics.sum_by_name(metric_name) { + Some(MetricValue::PruningMetrics { + pruning_metrics, .. + }) => pruning_metrics.pruned(), Some(v) => v.as_usize(), - _ => { - panic!( - "Expected metric not found. 
Looking for '{metric_name}' in\n\n{metrics:#?}" - ); - } + None => panic!( + "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}" + ), } } diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 34a48cdae374..097600e45ead 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -126,79 +126,97 @@ struct TestOutput { impl TestOutput { /// retrieve the value of the named metric, if any fn metric_value(&self, metric_name: &str) -> Option { + if let Some((pruned, _matched)) = self.pruning_metric(metric_name) { + return Some(pruned); + } + self.parquet_metrics .sum(|metric| metric.value().name() == metric_name) - .map(|v| v.as_usize()) - } - - /// The number of times the pruning predicate evaluation errors - fn predicate_evaluation_errors(&self) -> Option { - self.metric_value("predicate_evaluation_errors") - } - - /// The number of row_groups matched by bloom filter - fn row_groups_matched_bloom_filter(&self) -> Option { - self.metric_value("row_groups_matched_bloom_filter") - } - - /// The number of row_groups pruned by bloom filter - fn row_groups_pruned_bloom_filter(&self) -> Option { - self.metric_value("row_groups_pruned_bloom_filter") - } - - /// The number of row_groups matched by statistics - fn row_groups_matched_statistics(&self) -> Option { - self.metric_value("row_groups_matched_statistics") - } - - /// The number of row_groups pruned by statistics - fn row_groups_pruned_statistics(&self) -> Option { - self.metric_value("row_groups_pruned_statistics") + .map(|v| match v { + MetricValue::PruningMetrics { + pruning_metrics, .. + } => pruning_metrics.pruned(), + _ => v.as_usize(), + }) } - /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, - /// for testing purpose, here it only aggregate the `pruned` count. - fn files_ranges_pruned_statistics(&self) -> Option { + fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize)> { let mut total_pruned = 0; + let mut total_matched = 0; let mut found = false; for metric in self.parquet_metrics.iter() { let metric = metric.as_ref(); - if metric.value().name() == "files_ranges_pruned_statistics" { + if metric.value().name() == metric_name { if let MetricValue::PruningMetrics { pruning_metrics, .. } = metric.value() { total_pruned += pruning_metrics.pruned(); + total_matched += pruning_metrics.matched(); found = true; } } } if found { - Some(total_pruned) + Some((total_pruned, total_matched)) } else { None } } + /// The number of times the pruning predicate evaluation errors + fn predicate_evaluation_errors(&self) -> Option { + self.metric_value("predicate_evaluation_errors") + } + + /// The number of row_groups pruned / matched by bloom filter + fn row_groups_bloom_filter(&self) -> Option<(usize, usize)> { + self.pruning_metric("row_groups_pruned_bloom_filter") + } + + /// The number of row_groups matched by statistics + fn row_groups_matched_statistics(&self) -> Option { + self.pruning_metric("row_groups_pruned_statistics") + .map(|(_pruned, matched)| matched) + } + + /// The number of row_groups pruned by statistics + fn row_groups_pruned_statistics(&self) -> Option { + self.pruning_metric("row_groups_pruned_statistics") + .map(|(pruned, _matched)| pruned) + } + + /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count, + /// for testing purpose, here it only aggregate the `pruned` count. 
+ fn files_ranges_pruned_statistics(&self) -> Option { + self.pruning_metric("files_ranges_pruned_statistics") + .map(|(pruned, _matched)| pruned) + } + /// The number of row_groups matched by bloom filter or statistics + /// + /// E.g. starting with 10 row groups, statistics: 10 total -> 7 matched, bloom + /// filter: 7 total -> 3 matched, this function returns 3 for the final matched + /// count. fn row_groups_matched(&self) -> Option { - self.row_groups_matched_bloom_filter() - .zip(self.row_groups_matched_statistics()) - .map(|(a, b)| a + b) + self.row_groups_bloom_filter() + .map(|(_pruned, matched)| matched) } /// The number of row_groups pruned fn row_groups_pruned(&self) -> Option { - self.row_groups_pruned_bloom_filter() + self.row_groups_bloom_filter() + .map(|(pruned, _matched)| pruned) .zip(self.row_groups_pruned_statistics()) .map(|(a, b)| a + b) } /// The number of row pages pruned fn row_pages_pruned(&self) -> Option { - self.metric_value("page_index_rows_pruned") + self.pruning_metric("page_index_rows_pruned") + .map(|(pruned, _matched)| pruned) } fn description(&self) -> String { diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 44409166d3ce..0411298055f2 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -133,13 +133,14 @@ impl RowGroupPruningTest { self.expected_files_pruned_by_statistics, "mismatched files_ranges_pruned_statistics", ); + let bloom_filter_metrics = output.row_groups_bloom_filter(); assert_eq!( - output.row_groups_matched_bloom_filter(), + bloom_filter_metrics.map(|(_pruned, matched)| matched), self.expected_row_group_matched_by_bloom_filter, "mismatched row_groups_matched_bloom_filter", ); assert_eq!( - output.row_groups_pruned_bloom_filter(), + bloom_filter_metrics.map(|(pruned, _matched)| pruned), self.expected_row_group_pruned_by_bloom_filter, "mismatched row_groups_pruned_bloom_filter", ); @@ -163,7 +164,7 @@ async fn prune_timestamps_nanos() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -181,7 +182,7 @@ async fn prune_timestamps_micros() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -199,7 +200,7 @@ async fn prune_timestamps_millis() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -217,7 +218,7 @@ async fn prune_timestamps_seconds() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -233,7 +234,7 @@ async fn prune_date32() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(1) 
.test_row_group_prune() @@ -262,8 +263,9 @@ async fn prune_date64() { println!("{}", output.description()); // This should prune out groups without error assert_eq!(output.predicate_evaluation_errors(), Some(0)); - assert_eq!(output.row_groups_matched(), Some(1)); - assert_eq!(output.row_groups_pruned(), Some(3)); + // 'dates' table has 4 row groups, and only the first one is matched by the predicate + assert_eq!(output.row_groups_matched_statistics(), Some(1)); + assert_eq!(output.row_groups_pruned_statistics(), Some(3)); assert_eq!(output.result_rows, 1, "{}", output.description()); } @@ -276,7 +278,7 @@ async fn prune_disabled() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(10) .test_row_group_prune() @@ -296,7 +298,7 @@ async fn prune_disabled() { // This should not prune any assert_eq!(output.predicate_evaluation_errors(), Some(0)); - assert_eq!(output.row_groups_matched(), Some(0)); + assert_eq!(output.row_groups_matched(), Some(4)); assert_eq!(output.row_groups_pruned(), Some(0)); assert_eq!( output.result_rows, @@ -322,7 +324,7 @@ macro_rules! int_tests { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -337,7 +339,7 @@ macro_rules! int_tests { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -381,10 +383,10 @@ macro_rules! int_tests { .with_scenario(Scenario::Int) .with_query(&format!("SELECT * FROM t where abs(i{}) = 1", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(3) .test_row_group_prune() @@ -397,10 +399,10 @@ macro_rules! int_tests { .with_scenario(Scenario::Int) .with_query(&format!("SELECT * FROM t where i{}+1 = 1", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -413,10 +415,10 @@ macro_rules! int_tests { .with_scenario(Scenario::Int) .with_query(&format!("SELECT * FROM t where 1-i{} > 1", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(9) .test_row_group_prune() @@ -498,7 +500,7 @@ macro_rules! 
uint_tests { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -542,10 +544,10 @@ macro_rules! uint_tests { .with_scenario(Scenario::UInt) .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -558,10 +560,10 @@ macro_rules! uint_tests { .with_scenario(Scenario::UInt) .with_query(&format!("SELECT * FROM t where u{}+1 = 6", $bits)) .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -682,7 +684,7 @@ async fn prune_f64_lt() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -694,7 +696,7 @@ async fn prune_f64_lt() { .with_matched_by_stats(Some(3)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(3)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(11) .test_row_group_prune() @@ -712,7 +714,7 @@ async fn prune_f64_scalar_fun_and_gt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(1) .test_row_group_prune() @@ -726,10 +728,10 @@ async fn prune_f64_scalar_fun() { .with_scenario(Scenario::Float64) .with_query("SELECT * FROM t where abs(f-1) <= 0.000001") .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(1) .test_row_group_prune() @@ -743,10 +745,10 @@ async fn prune_f64_complex_expr() { .with_scenario(Scenario::Float64) .with_query("SELECT * FROM t where f+1 > 1.1") .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(9) .test_row_group_prune() @@ -760,10 +762,10 @@ async fn prune_f64_complex_expr_subtract() { .with_scenario(Scenario::Float64) .with_query("SELECT * FROM t where 1-f > 1") .with_expected_errors(Some(0)) - .with_matched_by_stats(Some(0)) + .with_matched_by_stats(Some(4)) .with_pruned_by_stats(Some(0)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(4)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(9) .test_row_group_prune() 
@@ -782,7 +784,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -794,7 +796,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(8) .test_row_group_prune() @@ -806,7 +808,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -818,7 +820,7 @@ async fn prune_decimal_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(8) .test_row_group_prune() @@ -894,7 +896,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(5) .test_row_group_prune() @@ -906,7 +908,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -918,7 +920,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(5) .test_row_group_prune() @@ -930,7 +932,7 @@ async fn prune_decimal_in_list() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(6) .test_row_group_prune() @@ -1064,7 +1066,7 @@ async fn prune_string_lt() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(3) .test_row_group_prune() @@ -1079,7 +1081,7 @@ async fn prune_string_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) // all backends from 'mixed' and 'all backends' .with_expected_rows(8) @@ -1172,7 +1174,7 @@ async fn prune_binary_lt() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(3) .test_row_group_prune() @@ -1187,7 +1189,7 @@ async fn prune_binary_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) 
.with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) // all backends from 'mixed' and 'all backends' .with_expected_rows(8) @@ -1279,7 +1281,7 @@ async fn prune_fixedsizebinary_lt() { .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(2)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .with_expected_rows(2) .test_row_group_prune() @@ -1294,7 +1296,7 @@ async fn prune_fixedsizebinary_lt() { .with_matched_by_stats(Some(2)) .with_pruned_by_stats(Some(1)) .with_pruned_files(Some(0)) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) // all backends from 'mixed' and 'all backends' .with_expected_rows(8) @@ -1362,7 +1364,7 @@ async fn test_row_group_with_null_values() { .with_pruned_files(Some(0)) .with_pruned_by_stats(Some(2)) .with_expected_rows(5) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .test_row_group_prune() .await; @@ -1376,7 +1378,7 @@ async fn test_row_group_with_null_values() { .with_pruned_files(Some(0)) .with_pruned_by_stats(Some(1)) .with_expected_rows(10) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(2)) .with_pruned_by_bloom_filter(Some(0)) .test_row_group_prune() .await; @@ -1390,7 +1392,7 @@ async fn test_row_group_with_null_values() { .with_pruned_files(Some(0)) .with_pruned_by_stats(Some(2)) .with_expected_rows(5) - .with_matched_by_bloom_filter(Some(0)) + .with_matched_by_bloom_filter(Some(1)) .with_pruned_by_bloom_filter(Some(0)) .test_row_group_prune() .await; diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index a7cc30a9484c..b3e8dac111be 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -852,10 +852,14 @@ async fn parquet_explain_analyze() { // should contain aggregated stats assert_contains!(&formatted, "output_rows=8"); - assert_contains!(&formatted, "row_groups_matched_bloom_filter=0"); - assert_contains!(&formatted, "row_groups_pruned_bloom_filter=0"); - assert_contains!(&formatted, "row_groups_matched_statistics=1"); - assert_contains!(&formatted, "row_groups_pruned_statistics=0"); + assert_contains!( + &formatted, + "row_groups_pruned_bloom_filter=1 total \u{2192} 1 matched" + ); + assert_contains!( + &formatted, + "row_groups_pruned_statistics=1 total \u{2192} 1 matched" + ); } // This test reproduces the behavior described in @@ -995,9 +999,7 @@ async fn parquet_explain_analyze_verbose() { .to_string(); // should contain the raw per file stats (with the label) - assert_contains!(&formatted, "row_groups_matched_bloom_filter{partition=0"); assert_contains!(&formatted, "row_groups_pruned_bloom_filter{partition=0"); - assert_contains!(&formatted, "row_groups_matched_statistics{partition=0"); assert_contains!(&formatted, "row_groups_pruned_statistics{partition=0"); } diff --git a/datafusion/datasource-parquet/src/metrics.rs b/datafusion/datasource-parquet/src/metrics.rs index 9d86a3ae9f2d..306bc9e6b013 100644 --- a/datafusion/datasource-parquet/src/metrics.rs +++ b/datafusion/datasource-parquet/src/metrics.rs @@ -44,14 +44,10 @@ pub struct ParquetFileMetrics { pub files_ranges_pruned_statistics: PruningMetrics, /// Number of times the predicate could not be evaluated pub 
predicate_evaluation_errors: Count, - /// Number of row groups whose bloom filters were checked and matched (not pruned) - pub row_groups_matched_bloom_filter: Count, - /// Number of row groups pruned by bloom filters - pub row_groups_pruned_bloom_filter: Count, - /// Number of row groups whose statistics were checked and matched (not pruned) - pub row_groups_matched_statistics: Count, - /// Number of row groups pruned by statistics - pub row_groups_pruned_statistics: Count, + /// Number of row groups whose bloom filters were checked, tracked with matched/pruned counts + pub row_groups_pruned_bloom_filter: PruningMetrics, + /// Number of row groups whose statistics were checked, tracked with matched/pruned counts + pub row_groups_pruned_statistics: PruningMetrics, /// Total number of bytes scanned pub bytes_scanned: Count, /// Total rows filtered out by predicates pushed into parquet scan @@ -64,10 +60,8 @@ pub struct ParquetFileMetrics { pub statistics_eval_time: Time, /// Total time spent evaluating row group Bloom Filters pub bloom_filter_eval_time: Time, - /// Total rows filtered out by parquet page index - pub page_index_rows_pruned: Count, - /// Total rows passed through the parquet page index - pub page_index_rows_matched: Count, + /// Total rows filtered or matched by parquet page index + pub page_index_rows_pruned: PruningMetrics, /// Total time spent evaluating parquet page index filters pub page_index_eval_time: Time, /// Total time spent reading and parsing metadata from the footer @@ -91,34 +85,20 @@ impl ParquetFileMetrics { // ----------------------- // 'summary' level metrics // ----------------------- - let row_groups_matched_bloom_filter = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .with_type(MetricType::SUMMARY) - .counter("row_groups_matched_bloom_filter", partition); - let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) .with_type(MetricType::SUMMARY) - .counter("row_groups_pruned_bloom_filter", partition); - - let row_groups_matched_statistics = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .with_type(MetricType::SUMMARY) - .counter("row_groups_matched_statistics", partition); + .pruning_metrics("row_groups_pruned_bloom_filter", partition); let row_groups_pruned_statistics = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) .with_type(MetricType::SUMMARY) - .counter("row_groups_pruned_statistics", partition); + .pruning_metrics("row_groups_pruned_statistics", partition); let page_index_rows_pruned = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) .with_type(MetricType::SUMMARY) - .counter("page_index_rows_pruned", partition); - let page_index_rows_matched = MetricBuilder::new(metrics) - .with_new_label("filename", filename.to_string()) - .with_type(MetricType::SUMMARY) - .counter("page_index_rows_matched", partition); + .pruning_metrics("page_index_rows_pruned", partition); let bytes_scanned = MetricBuilder::new(metrics) .with_new_label("filename", filename.to_string()) @@ -173,16 +153,13 @@ impl ParquetFileMetrics { Self { files_ranges_pruned_statistics, predicate_evaluation_errors, - row_groups_matched_bloom_filter, row_groups_pruned_bloom_filter, - row_groups_matched_statistics, row_groups_pruned_statistics, bytes_scanned, pushdown_rows_pruned, pushdown_rows_matched, row_pushdown_eval_time, page_index_rows_pruned, - page_index_rows_matched, statistics_eval_time, 
bloom_filter_eval_time, page_index_eval_time, diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 1c9b9feb9f50..2815b82f1d45 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -361,6 +361,7 @@ impl FileOpener for ParquetOpener { if let Some(range) = file_range.as_ref() { row_groups.prune_by_range(rg_metadata, range); } + // If there is a predicate that can be evaluated against the metadata if let Some(predicate) = predicate.as_ref() { if enable_row_group_stats_pruning { @@ -371,6 +372,12 @@ impl FileOpener for ParquetOpener { predicate, &file_metrics, ); + } else { + // Update metrics: statistics unavailable, so all row groups are + // matched (not pruned) + file_metrics + .row_groups_pruned_statistics + .add_matched(row_groups.remaining_row_group_count()); } if enable_bloom_filter && !row_groups.is_empty() { @@ -382,7 +389,22 @@ impl FileOpener for ParquetOpener { &file_metrics, ) .await; + } else { + // Update metrics: bloom filter unavailable, so all row groups are + // matched (not pruned) + file_metrics + .row_groups_pruned_bloom_filter + .add_matched(row_groups.remaining_row_group_count()); } + } else { + // Update metrics: no predicate, so all row groups are matched (not pruned) + let n_remaining_row_groups = row_groups.remaining_row_group_count(); + file_metrics + .row_groups_pruned_statistics + .add_matched(n_remaining_row_groups); + file_metrics + .row_groups_pruned_bloom_filter + .add_matched(n_remaining_row_groups); } let mut access_plan = row_groups.build(); diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs index 82deedd406ce..2698b6c5fbb6 100644 --- a/datafusion/datasource-parquet/src/page_filter.rs +++ b/datafusion/datasource-parquet/src/page_filter.rs @@ -269,8 +269,10 @@ impl PagePruningAccessPlanFilter { } } - file_metrics.page_index_rows_pruned.add(total_skip); - file_metrics.page_index_rows_matched.add(total_select); + file_metrics.page_index_rows_pruned.add_pruned(total_skip); + file_metrics + .page_index_rows_pruned + .add_matched(total_select); access_plan } diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 51d50d780f10..2043f75070b5 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -60,6 +60,11 @@ impl RowGroupAccessPlanFilter { self.access_plan.is_empty() } + /// Return the number of row groups that are currently expected to be scanned + pub fn remaining_row_group_count(&self) -> usize { + self.access_plan.row_group_index_iter().count() + } + /// Returns the inner access plan pub fn build(self) -> ParquetAccessPlan { self.access_plan @@ -134,9 +139,9 @@ impl RowGroupAccessPlanFilter { for (idx, &value) in row_group_indexes.iter().zip(values.iter()) { if !value { self.access_plan.skip(*idx); - metrics.row_groups_pruned_statistics.add(1); + metrics.row_groups_pruned_statistics.add_pruned(1); } else { - metrics.row_groups_matched_statistics.add(1); + metrics.row_groups_pruned_statistics.add_matched(1); } } } @@ -215,10 +220,10 @@ impl RowGroupAccessPlanFilter { }; if prune_group { - metrics.row_groups_pruned_bloom_filter.add(1); + metrics.row_groups_pruned_bloom_filter.add_pruned(1); self.access_plan.skip(idx) - } else if !stats.column_sbbf.is_empty() { - metrics.row_groups_matched_bloom_filter.add(1); + } else { + 
metrics.row_groups_pruned_bloom_filter.add_matched(1); } } } @@ -494,6 +499,18 @@ mod tests { } } + #[test] + fn remaining_row_group_count_reports_non_skipped_groups() { + let mut filter = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(4)); + assert_eq!(filter.remaining_row_group_count(), 4); + + filter.access_plan.skip(1); + assert_eq!(filter.remaining_row_group_count(), 3); + + filter.access_plan.skip(3); + assert_eq!(filter.remaining_row_group_count(), 2); + } + #[test] fn row_group_pruning_predicate_simple_expr() { use datafusion_expr::{col, lit}; diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs index e66db8f0c911..c9ddbe8f8983 100644 --- a/datafusion/physical-plan/src/metrics/mod.rs +++ b/datafusion/physical-plan/src/metrics/mod.rs @@ -304,7 +304,7 @@ impl MetricsSet { MetricValue::Gauge { name, .. } => name == metric_name, MetricValue::StartTimestamp(_) => false, MetricValue::EndTimestamp(_) => false, - MetricValue::PruningMetrics { .. } => false, + MetricValue::PruningMetrics { name, .. } => name == metric_name, MetricValue::Custom { .. } => false, }) } diff --git a/docs/source/user-guide/explain-usage.md b/docs/source/user-guide/explain-usage.md index 2288cae85dda..5a1184539c03 100644 --- a/docs/source/user-guide/explain-usage.md +++ b/docs/source/user-guide/explain-usage.md @@ -225,14 +225,11 @@ Again, reading from bottom up: When predicate pushdown is enabled, `DataSourceExec` with `ParquetSource` gains the following metrics: -- `page_index_rows_matched`: number of rows in pages that were tested by a page index filter, and passed -- `page_index_rows_pruned`: number of rows in pages that were tested by a page index filter, and did not pass -- `row_groups_matched_bloom_filter`: number of row groups that were tested by a Bloom Filter, and passed -- `row_groups_pruned_bloom_filter`: number of row groups that were tested by a Bloom Filter, and did not pass -- `row_groups_matched_statistics`: number of row groups that were tested by row group statistics (min and max value), and passed -- `row_groups_pruned_statistics`: number of row groups that were tested by row group statistics (min and max value), and did not pass -- `pushdown_rows_matched`: rows that were tested by any of the above filtered, and passed all of them (this should be minimum of `page_index_rows_matched`, `row_groups_pruned_bloom_filter`, and `row_groups_pruned_statistics`) -- `pushdown_rows_pruned`: rows that were tested by any of the above filtered, and did not pass one of them (this should be sum of `page_index_rows_matched`, `row_groups_pruned_bloom_filter`, and `row_groups_pruned_statistics`) +- `page_index_rows_pruned`: number of rows evaluated by page index filters. The metric reports both how many rows were considered in total and how many matched (were not pruned). +- `row_groups_pruned_bloom_filter`: number of row groups evaluated by Bloom Filters, reporting both total checked groups and groups that matched. +- `row_groups_pruned_statistics`: number of row groups evaluated by row-group statistics (min/max), reporting both total checked groups and groups that matched. +- `pushdown_rows_matched`: rows that were tested by any of the above filters, and passed all of them. +- `pushdown_rows_pruned`: rows that were tested by any of the above filters, and did not pass at least one of them. 
- `predicate_evaluation_errors`: number of times evaluating the filter expression failed (expected to be zero in normal operation) - `num_predicate_creation_errors`: number of errors creating predicates (expected to be zero in normal operation) - `bloom_filter_eval_time`: time spent parsing and evaluating Bloom Filters From a78242360bf06a7c0bafea9f40f975a13de90850 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Thu, 30 Oct 2025 14:26:41 +0800 Subject: [PATCH 051/157] ci: fix temporary file creation in tests and tighten CI check (#18374) ## Which issue does this PR close? - Closes #. ## Rationale for this change A temporary file is created during tests; see the reproducer in `datafusion-cli`: ```sh yongting@Yongtings-MacBook-Pro-2 ~/C/datafusion (main=)> cargo test --package datafusion --test core_integration --all-features -- dataframe::test_copy_schema --exact --nocapture Compiling datafusion v50.3.0 (/Users/yongting/Code/datafusion/datafusion/core) Finished `test` profile [unoptimized + debuginfo] target(s) in 2.50s Running tests/core_integration.rs (target/debug/deps/core_integration-dee3896b38f536b2) running 1 test test dataframe::test_copy_schema ... ok test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 801 filtered out; finished in 0.02s yongting@Yongtings-MacBook-Pro-2 ~/C/datafusion (main=)> git status On branch main Your branch is up to date with 'upstream/main'. Untracked files: (use "git add ..." to include in what will be committed) "datafusion/core/\"/" nothing added to commit but untracked files present (use "git add" to track) ``` This PR fixes this test, and makes CI stricter for similar temporary file creations. ## What changes are included in this PR? ## Are these changes tested? Yes, I have run the CI without the fix in my local repo; it fails as expected: https://github.com/2010YOUY01/arrow-datafusion/actions/runs/18913128118/job/53989721867 After the fix, the CI should be able to pass. ## Are there any user-facing changes? --- .github/workflows/rust.yml | 9 +++++++++ datafusion/core/tests/dataframe/mod.rs | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7019de0b7507..8a3563899fc6 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -302,6 +302,15 @@ jobs: --features serde,avro,json,backtrace,integration-tests,parquet_encryption - name: Verify Working Directory Clean run: git diff --exit-code + # Check that no temporary directories are created during tests. + # The `false/` folder is excluded for the rust cache. + - name: Verify Working Directory Clean (No Untracked Files) + run: | + STATUS="$(git status --porcelain | sed -e '/^?? false\/$/d' -e '/^?? 
false$/d')" + if [ -n "$STATUS" ]; then + echo "$STATUS" + exit 1 + fi # datafusion-cli tests linux-test-datafusion-cli: diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index e27a3414850a..c35e3b2eb31b 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -6328,7 +6328,7 @@ async fn test_copy_schema() -> Result<()> { let target_path = tmp_dir.path().join("target.csv"); let query = format!( - "COPY source_table TO '{:?}' STORED AS csv", + "COPY source_table TO '{}' STORED AS csv", target_path.to_str().unwrap() ); From ff670d51e9c671bf1692376a447e4763a0643435 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 30 Oct 2025 03:00:23 -0400 Subject: [PATCH 052/157] Run extended tests when there are changes to datafusion-testing pin (#18310) ## Which issue does this PR close? ## Rationale for this change The `extended` tests rely on the checkout of datafusion-testing (that has the expected results for the sqlite sqllogictest suite) However, we don't currently run the extended tests when that pin is changed so we could potentially break CI on main if we don't catch changes in code review (this just happened to me in https://github.com/apache/datafusion/pull/17866#pullrequestreview-3385422253) ## What changes are included in this PR? 1. Run extended CI tests on changes to datafusion-testing ## Are these changes tested? I tested this in PR - https://github.com/apache/datafusion/pull/18311 - https://github.com/apache/datafusion/pull/18312 ## Are there any user-facing changes? No --- .github/workflows/extended.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml index 23bd66a0cf35..2472d2e0424f 100644 --- a/.github/workflows/extended.yml +++ b/.github/workflows/extended.yml @@ -44,6 +44,7 @@ on: - 'datafusion/physical*/**/*.rs' - 'datafusion/expr*/**/*.rs' - 'datafusion/optimizer/**/*.rs' + - 'datafusion-testing' workflow_dispatch: inputs: pr_number: From 6c852a4d2511b994cb0d5b6acc09591372e9b533 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Thu, 30 Oct 2025 18:59:53 +1100 Subject: [PATCH 053/157] Introduce `expr_fields` to `AccumulatorArgs` to hold input argument fields (#18100) ## Which issue does this PR close? - Closes #16997 - Part of #11725 - Supersedes #17085 ## Rationale for this change When reviewing #17085 I was very confused by the fix suggested, and tried to understand why `AccumulatorArgs` didn't have easy access to `Field`s of its input expressions, as compared to scalar/window functions which do. Introducing this new field should make it easier for users to grab datatype, metadata, nullability of their input expressions for aggregate functions. ## What changes are included in this PR? Add a slice of `FieldRef` to `AccumulatorArgs` so users don't need to compute the input expression fields themselves via using schema. This addresses #16997 as it was confusing to have only the schema available as there are valid (?) cases where the schema is empty (such as literal only input). This fix differs from #17085 in that it doesn't special case for when there is literal only input; it leaves the physical `schema` provided to `AccumulatorArgs` untouched but provides a more ergonomic (and less confusing) API for users to retrieve `Field`s of their input arguments. - I'm still not sure if the schema being empty for literal only inputs is correct or not, so this might be considered a side step. 
If we could remove `schema` entirely from `AccumulatorArgs` maybe we wouldn't need to worry about this, but see my comment for why that wasn't done in this PR ## Are these changes tested? Existing unit tests. ## Are there any user-facing changes? Yes, new field to `AccumulatorArgs` which is publicly exposed (with all it's fields). --- .../user_defined/user_defined_aggregates.rs | 8 +----- datafusion/ffi/src/udaf/accumulator_args.rs | 9 +++++++ datafusion/ffi/src/udaf/mod.rs | 2 ++ .../src/accumulator.rs | 6 ++++- .../functions-aggregate/benches/count.rs | 8 ++++-- .../benches/min_max_bytes.rs | 1 + datafusion/functions-aggregate/benches/sum.rs | 3 ++- .../src/approx_distinct.rs | 2 +- .../functions-aggregate/src/approx_median.rs | 2 +- .../src/approx_percentile_cont.rs | 13 +++++---- .../src/approx_percentile_cont_with_weight.rs | 23 +++++++++++++++- .../functions-aggregate/src/array_agg.rs | 18 ++++++++----- datafusion/functions-aggregate/src/average.rs | 27 ++++++++++--------- datafusion/functions-aggregate/src/count.rs | 6 +++-- datafusion/functions-aggregate/src/median.rs | 4 +-- .../functions-aggregate/src/nth_value.rs | 4 +-- datafusion/functions-aggregate/src/stddev.rs | 9 +++++-- .../functions-aggregate/src/string_agg.rs | 15 ++++++++++- datafusion/physical-expr/src/aggregate.rs | 17 +++++++++++- 19 files changed, 126 insertions(+), 51 deletions(-) diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 982b4804597e..62e8ab18b9be 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -954,13 +954,7 @@ impl AggregateUDFImpl for MetadataBasedAggregateUdf { } fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { - let input_expr = acc_args - .exprs - .first() - .ok_or(exec_datafusion_err!("Expected one argument"))?; - let input_field = input_expr.return_field(acc_args.schema)?; - - let double_output = input_field + let double_output = acc_args.expr_fields[0] .metadata() .get("modify_values") .map(|v| v == "double_output") diff --git a/datafusion/ffi/src/udaf/accumulator_args.rs b/datafusion/ffi/src/udaf/accumulator_args.rs index 0302c26a2e6b..6ac0a0b21d2d 100644 --- a/datafusion/ffi/src/udaf/accumulator_args.rs +++ b/datafusion/ffi/src/udaf/accumulator_args.rs @@ -97,6 +97,7 @@ impl TryFrom> for FFI_AccumulatorArgs { pub struct ForeignAccumulatorArgs { pub return_field: FieldRef, pub schema: Schema, + pub expr_fields: Vec, pub ignore_nulls: bool, pub order_bys: Vec, pub is_reversed: bool, @@ -132,9 +133,15 @@ impl TryFrom for ForeignAccumulatorArgs { let exprs = parse_physical_exprs(&proto_def.expr, &task_ctx, &schema, &codex)?; + let expr_fields = exprs + .iter() + .map(|e| e.return_field(&schema)) + .collect::, _>>()?; + Ok(Self { return_field, schema, + expr_fields, ignore_nulls: proto_def.ignore_nulls, order_bys, is_reversed: value.is_reversed, @@ -150,6 +157,7 @@ impl<'a> From<&'a ForeignAccumulatorArgs> for AccumulatorArgs<'a> { Self { return_field: Arc::clone(&value.return_field), schema: &value.schema, + expr_fields: &value.expr_fields, ignore_nulls: value.ignore_nulls, order_bys: &value.order_bys, is_reversed: value.is_reversed, @@ -175,6 +183,7 @@ mod tests { let orig_args = AccumulatorArgs { return_field: Field::new("f", DataType::Float64, true).into(), schema: &schema, + expr_fields: &[Field::new("a", DataType::Int32, true).into()], ignore_nulls: false, order_bys: 
&[PhysicalSortExpr::new_default(col("a", &schema)?)], is_reversed: false, diff --git a/datafusion/ffi/src/udaf/mod.rs b/datafusion/ffi/src/udaf/mod.rs index 1ea1798c7c8b..ce5611590b67 100644 --- a/datafusion/ffi/src/udaf/mod.rs +++ b/datafusion/ffi/src/udaf/mod.rs @@ -705,6 +705,7 @@ mod tests { let acc_args = AccumulatorArgs { return_field: Field::new("f", DataType::Float64, true).into(), schema: &schema, + expr_fields: &[Field::new("a", DataType::Float64, true).into()], ignore_nulls: true, order_bys: &[PhysicalSortExpr::new_default(col("a", &schema)?)], is_reversed: false, @@ -782,6 +783,7 @@ mod tests { let acc_args = AccumulatorArgs { return_field: Field::new("f", DataType::Float64, true).into(), schema: &schema, + expr_fields: &[Field::new("a", DataType::Float64, true).into()], ignore_nulls: true, order_bys: &[PhysicalSortExpr::new_default(col("a", &schema)?)], is_reversed: false, diff --git a/datafusion/functions-aggregate-common/src/accumulator.rs b/datafusion/functions-aggregate-common/src/accumulator.rs index e0f7af1fb38e..8db0ab4133dc 100644 --- a/datafusion/functions-aggregate-common/src/accumulator.rs +++ b/datafusion/functions-aggregate-common/src/accumulator.rs @@ -30,7 +30,8 @@ pub struct AccumulatorArgs<'a> { /// The return field of the aggregate function. pub return_field: FieldRef, - /// The schema of the input arguments + /// Input schema to the aggregate function. If you need to check data type, nullability + /// or metadata of input arguments then you should use `expr_fields` below instead. pub schema: &'a Schema, /// Whether to ignore nulls. @@ -67,6 +68,9 @@ pub struct AccumulatorArgs<'a> { /// The physical expression of arguments the aggregate function takes. pub exprs: &'a [Arc], + + /// Fields corresponding to each expr (same order & length). 
+ pub expr_fields: &'a [FieldRef], } impl AccumulatorArgs<'_> { diff --git a/datafusion/functions-aggregate/benches/count.rs b/datafusion/functions-aggregate/benches/count.rs index 37c7fad4bd32..2f42d66c7c38 100644 --- a/datafusion/functions-aggregate/benches/count.rs +++ b/datafusion/functions-aggregate/benches/count.rs @@ -33,15 +33,17 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; fn prepare_group_accumulator() -> Box { let schema = Arc::new(Schema::new(vec![Field::new("f", DataType::Int32, true)])); + let expr = col("f", &schema).unwrap(); let accumulator_args = AccumulatorArgs { return_field: Field::new("f", DataType::Int64, true).into(), schema: &schema, + expr_fields: &[expr.return_field(&schema).unwrap()], ignore_nulls: false, order_bys: &[], is_reversed: false, name: "COUNT(f)", is_distinct: false, - exprs: &[col("f", &schema).unwrap()], + exprs: &[expr], }; let count_fn = Count::new(); @@ -56,15 +58,17 @@ fn prepare_accumulator() -> Box { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, )])); + let expr = col("f", &schema).unwrap(); let accumulator_args = AccumulatorArgs { return_field: Arc::new(Field::new_list_field(DataType::Int64, true)), schema: &schema, + expr_fields: &[expr.return_field(&schema).unwrap()], ignore_nulls: false, order_bys: &[], is_reversed: false, name: "COUNT(f)", is_distinct: true, - exprs: &[col("f", &schema).unwrap()], + exprs: &[expr], }; let count_fn = Count::new(); diff --git a/datafusion/functions-aggregate/benches/min_max_bytes.rs b/datafusion/functions-aggregate/benches/min_max_bytes.rs index a438ee5697a2..6d76ff2d0366 100644 --- a/datafusion/functions-aggregate/benches/min_max_bytes.rs +++ b/datafusion/functions-aggregate/benches/min_max_bytes.rs @@ -44,6 +44,7 @@ fn create_max_bytes_accumulator() -> Box { max.create_groups_accumulator(AccumulatorArgs { return_field: Arc::new(Field::new("value", DataType::Utf8, true)), schema: &input_schema, + expr_fields: &[Field::new("value", DataType::Utf8, true).into()], ignore_nulls: true, order_bys: &[], is_reversed: false, diff --git a/datafusion/functions-aggregate/benches/sum.rs b/datafusion/functions-aggregate/benches/sum.rs index a1e9894fb86c..6a21595927ec 100644 --- a/datafusion/functions-aggregate/benches/sum.rs +++ b/datafusion/functions-aggregate/benches/sum.rs @@ -31,8 +31,9 @@ fn prepare_accumulator(data_type: &DataType) -> Box { let field = Field::new("f", data_type.clone(), true).into(); let schema = Arc::new(Schema::new(vec![Arc::clone(&field)])); let accumulator_args = AccumulatorArgs { - return_field: field, + return_field: Arc::clone(&field), schema: &schema, + expr_fields: &[field], ignore_nulls: false, order_bys: &[], is_reversed: false, diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index 9affdb3ee5f6..998f981deef7 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -361,7 +361,7 @@ impl AggregateUDFImpl for ApproxDistinct { } fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { - let data_type = acc_args.exprs[0].data_type(acc_args.schema)?; + let data_type = acc_args.expr_fields[0].data_type(); let accumulator: Box = match data_type { // TODO u8, i8, u16, i16 shall really be done using bitmap, not HLL diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs index 976f4d2c9480..530dbf3e43c7 100644 --- 
a/datafusion/functions-aggregate/src/approx_median.rs +++ b/datafusion/functions-aggregate/src/approx_median.rs @@ -134,7 +134,7 @@ impl AggregateUDFImpl for ApproxMedian { Ok(Box::new(ApproxPercentileAccumulator::new( 0.5_f64, - acc_args.exprs[0].data_type(acc_args.schema)?, + acc_args.expr_fields[0].data_type().clone(), ))) } diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 668280314e8d..6513504b30b0 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -187,9 +187,9 @@ impl ApproxPercentileCont { None }; - let data_type = args.exprs[0].data_type(args.schema)?; + let data_type = args.expr_fields[0].data_type(); let accumulator: ApproxPercentileAccumulator = match data_type { - t @ (DataType::UInt8 + DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 @@ -198,12 +198,11 @@ impl ApproxPercentileCont { | DataType::Int32 | DataType::Int64 | DataType::Float32 - | DataType::Float64) => { + | DataType::Float64 => { if let Some(max_size) = tdigest_max_size { - ApproxPercentileAccumulator::new_with_max_size(percentile, t, max_size) - }else{ - ApproxPercentileAccumulator::new(percentile, t) - + ApproxPercentileAccumulator::new_with_max_size(percentile, data_type.clone(), max_size) + } else { + ApproxPercentileAccumulator::new(percentile, data_type.clone()) } } other => { diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index 89ff546039e5..215341b507af 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -220,7 +220,28 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight { Arc::clone(&acc_args.exprs[2]), // percentile ] }, - ..acc_args + expr_fields: if acc_args.exprs.len() == 4 { + &[ + Arc::clone(&acc_args.expr_fields[0]), // value + Arc::clone(&acc_args.expr_fields[2]), // percentile + Arc::clone(&acc_args.expr_fields[3]), // centroids + ] + } else { + &[ + Arc::clone(&acc_args.expr_fields[0]), // value + Arc::clone(&acc_args.expr_fields[2]), // percentile + ] + }, + // Unchanged below; we list each field explicitly in case we ever add more + // fields to AccumulatorArgs making it easier to see if changes are also + // needed here. 
+ return_field: acc_args.return_field, + schema: acc_args.schema, + ignore_nulls: acc_args.ignore_nulls, + order_bys: acc_args.order_bys, + is_reversed: acc_args.is_reversed, + name: acc_args.name, + is_distinct: acc_args.is_distinct, }; let approx_percentile_cont_accumulator = self.approx_percentile_cont.create_accumulator(sub_args)?; diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 4d8676f24a28..b830588d404b 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -162,9 +162,9 @@ impl AggregateUDFImpl for ArrayAgg { } fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { - let data_type = acc_args.exprs[0].data_type(acc_args.schema)?; - let ignore_nulls = - acc_args.ignore_nulls && acc_args.exprs[0].nullable(acc_args.schema)?; + let field = &acc_args.expr_fields[0]; + let data_type = field.data_type(); + let ignore_nulls = acc_args.ignore_nulls && field.is_nullable(); if acc_args.is_distinct { // Limitation similar to Postgres. The aggregation function can only mix @@ -191,7 +191,7 @@ impl AggregateUDFImpl for ArrayAgg { } }; return Ok(Box::new(DistinctArrayAggAccumulator::try_new( - &data_type, + data_type, sort_option, ignore_nulls, )?)); @@ -199,7 +199,7 @@ impl AggregateUDFImpl for ArrayAgg { let Some(ordering) = LexOrdering::new(acc_args.order_bys.to_vec()) else { return Ok(Box::new(ArrayAggAccumulator::try_new( - &data_type, + data_type, ignore_nulls, )?)); }; @@ -210,7 +210,7 @@ impl AggregateUDFImpl for ArrayAgg { .collect::>>()?; OrderSensitiveArrayAggAccumulator::try_new( - &data_type, + data_type, &ordering_dtypes, ordering, self.is_input_pre_ordered, @@ -802,6 +802,7 @@ mod tests { use datafusion_common::cast::as_generic_string_array; use datafusion_common::internal_err; use datafusion_physical_expr::expressions::Column; + use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; use std::sync::Arc; @@ -1159,15 +1160,18 @@ mod tests { } fn build(&self) -> Result> { + let expr = Arc::new(Column::new("col", 0)); + let expr_field = expr.return_field(&self.schema)?; ArrayAgg::default().accumulator(AccumulatorArgs { return_field: Arc::clone(&self.return_field), schema: &self.schema, + expr_fields: &[expr_field], ignore_nulls: false, order_bys: &self.order_bys, is_reversed: false, name: "", is_distinct: self.distinct, - exprs: &[Arc::new(Column::new("col", 0))], + exprs: &[expr], }) } diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs index 11960779ed18..bec1734e2e20 100644 --- a/datafusion/functions-aggregate/src/average.rs +++ b/datafusion/functions-aggregate/src/average.rs @@ -184,12 +184,12 @@ impl AggregateUDFImpl for Avg { } fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { - let data_type = acc_args.exprs[0].data_type(acc_args.schema)?; + let data_type = acc_args.expr_fields[0].data_type(); use DataType::*; // instantiate specialized accumulator based for the type if acc_args.is_distinct { - match (&data_type, acc_args.return_type()) { + match (data_type, acc_args.return_type()) { // Numeric types are converted to Float64 via `coerce_avg_type` during logical plan creation (Float64, _) => Ok(Box::new(Float64DistinctAvgAccumulator::default())), @@ -362,12 +362,13 @@ impl AggregateUDFImpl for Avg { ) -> Result> { use DataType::*; - let data_type = args.exprs[0].data_type(args.schema)?; + let data_type = 
args.expr_fields[0].data_type(); + // instantiate specialized accumulator based for the type - match (&data_type, args.return_field.data_type()) { + match (data_type, args.return_field.data_type()) { (Float64, Float64) => { Ok(Box::new(AvgGroupsAccumulator::::new( - &data_type, + data_type, args.return_field.data_type(), |sum: f64, count: u64| Ok(sum / count as f64), ))) @@ -386,7 +387,7 @@ impl AggregateUDFImpl for Avg { move |sum: i32, count: u64| decimal_averager.avg(sum, count as i32); Ok(Box::new(AvgGroupsAccumulator::::new( - &data_type, + data_type, args.return_field.data_type(), avg_fn, ))) @@ -405,7 +406,7 @@ impl AggregateUDFImpl for Avg { move |sum: i64, count: u64| decimal_averager.avg(sum, count as i64); Ok(Box::new(AvgGroupsAccumulator::::new( - &data_type, + data_type, args.return_field.data_type(), avg_fn, ))) @@ -424,7 +425,7 @@ impl AggregateUDFImpl for Avg { move |sum: i128, count: u64| decimal_averager.avg(sum, count as i128); Ok(Box::new(AvgGroupsAccumulator::::new( - &data_type, + data_type, args.return_field.data_type(), avg_fn, ))) @@ -445,7 +446,7 @@ impl AggregateUDFImpl for Avg { }; Ok(Box::new(AvgGroupsAccumulator::::new( - &data_type, + data_type, args.return_field.data_type(), avg_fn, ))) @@ -459,7 +460,7 @@ impl AggregateUDFImpl for Avg { DurationSecondType, _, >::new( - &data_type, + data_type, args.return_type(), avg_fn, ))), @@ -467,7 +468,7 @@ impl AggregateUDFImpl for Avg { DurationMillisecondType, _, >::new( - &data_type, + data_type, args.return_type(), avg_fn, ))), @@ -475,7 +476,7 @@ impl AggregateUDFImpl for Avg { DurationMicrosecondType, _, >::new( - &data_type, + data_type, args.return_type(), avg_fn, ))), @@ -483,7 +484,7 @@ impl AggregateUDFImpl for Avg { DurationNanosecondType, _, >::new( - &data_type, + data_type, args.return_type(), avg_fn, ))), diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index c0d2ba199a13..065635a891f3 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -333,7 +333,7 @@ impl AggregateUDFImpl for Count { return not_impl_err!("COUNT DISTINCT with multiple arguments"); } - let data_type = &acc_args.exprs[0].data_type(acc_args.schema)?; + let data_type = acc_args.expr_fields[0].data_type(); Ok(match data_type { DataType::Dictionary(_, values_type) => { @@ -854,7 +854,7 @@ mod tests { datatypes::{DataType, Field, Int32Type, Schema}, }; use datafusion_expr::function::AccumulatorArgs; - use datafusion_physical_expr::expressions::Column; + use datafusion_physical_expr::{expressions::Column, PhysicalExpr}; use std::sync::Arc; /// Helper function to create a dictionary array with non-null keys but some null values /// Returns a dictionary array where: @@ -895,8 +895,10 @@ mod tests { // Using Count UDAF's accumulator let count = Count::new(); let expr = Arc::new(Column::new("dict_col", 0)); + let expr_field = expr.return_field(&schema)?; let args = AccumulatorArgs { schema: &schema, + expr_fields: &[expr_field], exprs: &[expr], is_distinct: true, name: "count", diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index a65759594eac..9466c6affb96 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -162,7 +162,7 @@ impl AggregateUDFImpl for Median { }; } - let dt = acc_args.exprs[0].data_type(acc_args.schema)?; + let dt = acc_args.expr_fields[0].data_type().clone(); downcast_integer! 
{ dt => (helper, dt), DataType::Float16 => helper!(Float16Type, dt), @@ -196,7 +196,7 @@ impl AggregateUDFImpl for Median { ); } - let dt = args.exprs[0].data_type(args.schema)?; + let dt = args.expr_fields[0].data_type().clone(); macro_rules! helper { ($t:ty, $dt:expr) => { diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index b9dc498ee746..2f4f9371be58 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -160,8 +160,8 @@ impl AggregateUDFImpl for NthValueAgg { .map(|e| e.expr.data_type(acc_args.schema)) .collect::>>()?; - let data_type = acc_args.exprs[0].data_type(acc_args.schema)?; - NthValueAccumulator::try_new(n, &data_type, &ordering_dtypes, ordering) + let data_type = acc_args.expr_fields[0].data_type(); + NthValueAccumulator::try_new(n, data_type, &ordering_dtypes, ordering) .map(|acc| Box::new(acc) as _) } diff --git a/datafusion/functions-aggregate/src/stddev.rs b/datafusion/functions-aggregate/src/stddev.rs index 312d5f11b477..782524aa4d0a 100644 --- a/datafusion/functions-aggregate/src/stddev.rs +++ b/datafusion/functions-aggregate/src/stddev.rs @@ -443,26 +443,31 @@ mod tests { agg2: Arc, schema: &Schema, ) -> Result { + let expr = col("a", schema)?; + let expr_field = expr.return_field(schema)?; + let args1 = AccumulatorArgs { return_field: Field::new("f", DataType::Float64, true).into(), schema, + expr_fields: &[Arc::clone(&expr_field)], ignore_nulls: false, order_bys: &[], name: "a", is_distinct: false, is_reversed: false, - exprs: &[col("a", schema)?], + exprs: &[Arc::clone(&expr)], }; let args2 = AccumulatorArgs { return_field: Field::new("f", DataType::Float64, true).into(), schema, + expr_fields: &[expr_field], ignore_nulls: false, order_bys: &[], name: "a", is_distinct: false, is_reversed: false, - exprs: &[col("a", schema)?], + exprs: &[expr], }; let mut accum1 = agg1.accumulator(args1)?; diff --git a/datafusion/functions-aggregate/src/string_agg.rs b/datafusion/functions-aggregate/src/string_agg.rs index a091ed34da70..4a040df7b4a3 100644 --- a/datafusion/functions-aggregate/src/string_agg.rs +++ b/datafusion/functions-aggregate/src/string_agg.rs @@ -199,7 +199,16 @@ impl AggregateUDFImpl for StringAgg { ) .into(), exprs: &filter_index(acc_args.exprs, 1), - ..acc_args + expr_fields: &filter_index(acc_args.expr_fields, 1), + // Unchanged below; we list each field explicitly in case we ever add more + // fields to AccumulatorArgs making it easier to see if changes are also + // needed here. 
+ schema: acc_args.schema, + ignore_nulls: acc_args.ignore_nulls, + order_bys: acc_args.order_bys, + is_reversed: acc_args.is_reversed, + name: acc_args.name, + is_distinct: acc_args.is_distinct, })?; Ok(Box::new(StringAggAccumulator::new( @@ -590,6 +599,10 @@ mod tests { StringAgg::new().accumulator(AccumulatorArgs { return_field: Field::new("f", DataType::LargeUtf8, true).into(), schema: &self.schema, + expr_fields: &[ + Field::new("col", DataType::LargeUtf8, true).into(), + Field::new("lit", DataType::Utf8, false).into(), + ], ignore_nulls: false, order_bys: &self.order_bys, is_reversed: false, diff --git a/datafusion/physical-expr/src/aggregate.rs b/datafusion/physical-expr/src/aggregate.rs index 19d2ecc924dd..2a8467eb8832 100644 --- a/datafusion/physical-expr/src/aggregate.rs +++ b/datafusion/physical-expr/src/aggregate.rs @@ -143,7 +143,7 @@ impl AggregateExprBuilder { /// # fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { /// # unimplemented!() /// # } - /// # + /// # /// # fn state_fields(&self, args: StateFieldsArgs) -> Result> { /// # unimplemented!() /// # } @@ -231,9 +231,15 @@ impl AggregateExprBuilder { Some(alias) => alias, }; + let arg_fields = args + .iter() + .map(|e| e.return_field(schema.as_ref())) + .collect::>>()?; + Ok(AggregateFunctionExpr { fun: Arc::unwrap_or_clone(fun), args, + arg_fields, return_field, name, human_display, @@ -306,6 +312,8 @@ impl AggregateExprBuilder { pub struct AggregateFunctionExpr { fun: AggregateUDF, args: Vec>, + /// Fields corresponding to args (same order & length) + arg_fields: Vec, /// Output / return field of this aggregate return_field: FieldRef, /// Output column name that this expression creates @@ -383,6 +391,7 @@ impl AggregateFunctionExpr { let acc_args = AccumulatorArgs { return_field: Arc::clone(&self.return_field), schema: &self.schema, + expr_fields: &self.arg_fields, ignore_nulls: self.ignore_nulls, order_bys: self.order_bys.as_ref(), is_distinct: self.is_distinct, @@ -467,6 +476,7 @@ impl AggregateFunctionExpr { let args = AccumulatorArgs { return_field: Arc::clone(&self.return_field), schema: &self.schema, + expr_fields: &self.arg_fields, ignore_nulls: self.ignore_nulls, order_bys: self.order_bys.as_ref(), is_distinct: self.is_distinct, @@ -536,6 +546,7 @@ impl AggregateFunctionExpr { let args = AccumulatorArgs { return_field: Arc::clone(&self.return_field), schema: &self.schema, + expr_fields: &self.arg_fields, ignore_nulls: self.ignore_nulls, order_bys: self.order_bys.as_ref(), is_distinct: self.is_distinct, @@ -555,6 +566,7 @@ impl AggregateFunctionExpr { let args = AccumulatorArgs { return_field: Arc::clone(&self.return_field), schema: &self.schema, + expr_fields: &self.arg_fields, ignore_nulls: self.ignore_nulls, order_bys: self.order_bys.as_ref(), is_distinct: self.is_distinct, @@ -638,6 +650,9 @@ impl AggregateFunctionExpr { Some(AggregateFunctionExpr { fun: self.fun.clone(), args, + // TODO: need to align arg_fields here with new args + // https://github.com/apache/datafusion/issues/18149 + arg_fields: self.arg_fields.clone(), return_field: Arc::clone(&self.return_field), name: self.name.clone(), // TODO: Human name should be updated after re-write to not mislead From 9fe6138089925c1f46844cc7865b3b9adf20b6c4 Mon Sep 17 00:00:00 2001 From: Pepijn Van Eeckhoudt Date: Thu, 30 Oct 2025 12:29:04 +0100 Subject: [PATCH 054/157] Add simple unit test for `merge` in case expression (#18369) ## Which issue does this PR close? 
- None, followup for #18152 ## Rationale for this change Add a unit test testing (and demonstrating) the merge function. ## What changes are included in this PR? Adds an additional test case ## Are these changes tested? Who tests the tests? ## Are there any user-facing changes? No --- .../physical-expr/src/expressions/case.rs | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index 0b4c3af1d9c5..d58b03842409 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -1954,4 +1954,35 @@ mod tests { Ok(()) } + + #[test] + fn test_merge() { + let a1 = StringArray::from(vec![Some("A")]).to_data(); + let a2 = StringArray::from(vec![Some("B")]).to_data(); + let a3 = StringArray::from(vec![Some("C"), Some("D")]).to_data(); + + let indices = vec![ + PartialResultIndex::none(), + PartialResultIndex::try_new(1).unwrap(), + PartialResultIndex::try_new(0).unwrap(), + PartialResultIndex::none(), + PartialResultIndex::try_new(2).unwrap(), + PartialResultIndex::try_new(2).unwrap(), + ]; + + let merged = merge(&vec![a1, a2, a3], &indices).unwrap(); + let merged = merged.as_string::(); + + assert_eq!(merged.len(), indices.len()); + assert!(!merged.is_valid(0)); + assert!(merged.is_valid(1)); + assert_eq!(merged.value(1), "B"); + assert!(merged.is_valid(2)); + assert_eq!(merged.value(2), "A"); + assert!(!merged.is_valid(3)); + assert!(merged.is_valid(4)); + assert_eq!(merged.value(4), "C"); + assert!(merged.is_valid(5)); + assert_eq!(merged.value(5), "D"); + } } From c2040416c2bdfc4c0618a2990329b185088ae984 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 Oct 2025 22:48:00 +1100 Subject: [PATCH 055/157] chore(deps): bump taiki-e/install-action from 2.62.40 to 2.62.41 (#18377) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.62.40 to 2.62.41.
Release notes

Sourced from taiki-e/install-action's releases.

2.62.41

  • Update osv-scanner@latest to 2.2.4.

  • Update zizmor@latest to 1.16.1.

  • Update vacuum@latest to 0.19.2.

  • Update mise@latest to 2025.10.19.

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

  • Update mise@latest to 2025.10.20.

  • Update cargo-nextest@latest to 0.9.109.

  • Update vacuum@latest to 0.19.4.

  • Update uv@latest to 0.9.6.

  • Update vacuum@latest to 0.19.3.

[2.62.41] - 2025-10-29

  • Update osv-scanner@latest to 2.2.4.

  • Update zizmor@latest to 1.16.1.

  • Update vacuum@latest to 0.19.2.

  • Update mise@latest to 2025.10.19.

[2.62.40] - 2025-10-28

  • Update wasm-bindgen@latest to 0.2.105.

[2.62.39] - 2025-10-27

  • Update vacuum@latest to 0.19.1.

  • Update cargo-shear@latest to 1.6.1.

  • Update cargo-binstall@latest to 1.15.9.

  • Update mise@latest to 2025.10.18.

[2.62.38] - 2025-10-25

  • Update coreutils@latest to 0.3.0.

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.62.40&new-version=2.62.41)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

---

Dependabot commands and options

You can trigger Dependabot actions by commenting on this PR:

- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 40d4d4cfa380..0d87ff438f79 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install cargo-audit - uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 + uses: taiki-e/install-action@1d76762916ba18e4f0c3b2f71fee3da83a279745 # v2.62.41 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8a3563899fc6..fe7faf941242 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -434,7 +434,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 + uses: taiki-e/install-action@1d76762916ba18e4f0c3b2f71fee3da83a279745 # v2.62.41 with: tool: wasm-pack - name: Run tests with headless mode @@ -761,7 +761,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@41ef8c65f4034ff24ab1cc2cef52f3000bcf9523 # v2.62.40 + uses: taiki-e/install-action@1d76762916ba18e4f0c3b2f71fee3da83a279745 # v2.62.41 with: tool: cargo-msrv From 11b6b8511ef18f27f8add550f20a554f4211e39a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 30 Oct 2025 18:49:13 +0200 Subject: [PATCH 056/157] feat: Add Hash trait to StatsType enum (#18382) ## Which issue does this PR close? N/A ## Rationale for this change To be able to use `derive(hash)` ## What changes are included in this PR? Add `Hash` to the `StatsType` enum ## Are these changes tested? No need ## Are there any user-facing changes? kinda --- datafusion/functions-aggregate-common/src/stats.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions-aggregate-common/src/stats.rs b/datafusion/functions-aggregate-common/src/stats.rs index bcd004db7831..593b105426be 100644 --- a/datafusion/functions-aggregate-common/src/stats.rs +++ b/datafusion/functions-aggregate-common/src/stats.rs @@ -17,7 +17,7 @@ /// TODO: Move this to functions-aggregate module /// Enum used for differentiating population and sample for statistical functions -#[derive(PartialEq, Eq, Debug, Clone, Copy)] +#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)] pub enum StatsType { /// Population Population, From 3b847772c0505d5c9957637f32b862f3cf358b38 Mon Sep 17 00:00:00 2001 From: Chen Chongchen Date: Fri, 31 Oct 2025 01:06:52 +0800 Subject: [PATCH 057/157] feat: support get_field for map literal (#18371) ## Which issue does this PR close? ## Rationale for this change currently, get_field for map only supports column. ## What changes are included in this PR? support get_field for map literal ## Are these changes tested? UT ## Are there any user-facing changes? 
No --- datafusion/functions-nested/src/planner.rs | 5 +--- datafusion/sqllogictest/test_files/map.slt | 32 +++++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/datafusion/functions-nested/src/planner.rs b/datafusion/functions-nested/src/planner.rs index f4fa8630a8d3..4fec5e38065b 100644 --- a/datafusion/functions-nested/src/planner.rs +++ b/datafusion/functions-nested/src/planner.rs @@ -18,7 +18,6 @@ //! SQL planning extensions like [`NestedFunctionPlanner`] and [`FieldAccessPlanner`] use arrow::datatypes::DataType; -use datafusion_common::ExprSchema; use datafusion_common::{plan_err, utils::list_ndims, DFSchema, Result}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams}; @@ -177,9 +176,7 @@ impl ExprPlanner for FieldAccessPlanner { )), )), // special case for map access with - Expr::Column(ref c) - if matches!(schema.data_type(c)?, DataType::Map(_, _)) => - { + _ if matches!(expr.get_type(schema)?, DataType::Map(_, _)) => { Ok(PlannerResult::Planned(Expr::ScalarFunction( ScalarFunction::new_udf( get_field_inner(), diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index fc21638b3f3c..949edb8376d1 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -526,11 +526,23 @@ SELECT MAP { 'a': 1, 'b': 3 }; query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type SELECT MAP { 'a': 1, 2: 3 }; -# TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key -# query ? -# SELECT MAP { 1: 'a', 2: 'b', 3: 'c' }[1]; -# ---- -# a +# accessing map with non-string key +query T +SELECT MAP { 1: 'a', 2: 'b', 3: 'c' }[1]; +---- +a + +# accessing map with string key +query I +SELECT MAP { 'a': 1, 'b': 2, 'c': 3 }['a']; +---- +1 + +# accessing map with non-string key in case expression +query I +SELECT (CASE WHEN 1 > 0 THEN MAP {'x': 100} ELSE MAP {'y': 200} END)['x']; +---- +100 # TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key # query ? @@ -538,11 +550,11 @@ SELECT MAP { 'a': 1, 2: 3 }; # ---- # 1 -# TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key -# query ? -# SELECT MAKE_MAP(1, null, 2, 33, 3, null)[2]; -# ---- -# 33 +# accessing map with non-string key +query I +SELECT MAKE_MAP(1, null, 2, 33, 3, null)[2]; +---- +33 ## cardinality From 7a002274a4a97d9964be8dca8e80f18fa262c626 Mon Sep 17 00:00:00 2001 From: Martin Hilton Date: Thu, 30 Oct 2025 18:42:44 +0000 Subject: [PATCH 058/157] fix: correct date_trunc for times before the epoch (#18356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #18334. ## Rationale for this change ## What changes are included in this PR? The array-based implementation of date_trunc can produce incorrect results for negative timestamps (i.e. dates before 1970-01-01). Check for any such incorrect values and compensate accordingly. Running the date_trunc benchmark suggests this fix introduces an ~9% performance cost. ``` date_trunc_minute_1000 time: [1.7424 µs 1.7495 µs 1.7583 µs] change: [+7.9289% +8.5950% +9.1955%] (p = 0.00 < 0.05) Performance has regressed. Found 4 outliers among 100 measurements (4.00%) 1 (1.00%) low mild 1 (1.00%) high mild 2 (2.00%) high severe ``` ## Are these changes tested? 
Yes, an SLT is added based on the issue. ## Are there any user-facing changes? --- .../functions/src/datetime/date_trunc.rs | 60 ++++++++++--------- .../sqllogictest/test_files/timestamps.slt | 24 ++++++++ 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 405aabfde991..543ed8038b2f 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -16,6 +16,7 @@ // under the License. use std::any::Any; +use std::num::NonZeroI64; use std::ops::{Add, Sub}; use std::str::FromStr; use std::sync::Arc; @@ -28,7 +29,7 @@ use arrow::array::types::{ ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray}; +use arrow::array::{Array, ArrayRef, PrimitiveArray}; use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View}; use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second}; use datafusion_common::cast::as_primitive_array; @@ -456,37 +457,40 @@ fn general_date_trunc_array_fine_granularity( granularity: &str, ) -> Result { let unit = match (tu, granularity) { - (Second, "minute") => Some(Int64Array::new_scalar(60)), - (Second, "hour") => Some(Int64Array::new_scalar(3600)), - (Second, "day") => Some(Int64Array::new_scalar(86400)), - - (Millisecond, "second") => Some(Int64Array::new_scalar(1_000)), - (Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)), - (Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)), - (Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)), - - (Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)), - (Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)), - (Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)), - (Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)), - (Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)), - - (Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)), - (Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)), - (Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)), - (Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)), - (Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)), - (Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)), + (Second, "minute") => NonZeroI64::new(60), + (Second, "hour") => NonZeroI64::new(3600), + (Second, "day") => NonZeroI64::new(86400), + + (Millisecond, "second") => NonZeroI64::new(1_000), + (Millisecond, "minute") => NonZeroI64::new(60_000), + (Millisecond, "hour") => NonZeroI64::new(3_600_000), + (Millisecond, "day") => NonZeroI64::new(86_400_000), + + (Microsecond, "millisecond") => NonZeroI64::new(1_000), + (Microsecond, "second") => NonZeroI64::new(1_000_000), + (Microsecond, "minute") => NonZeroI64::new(60_000_000), + (Microsecond, "hour") => NonZeroI64::new(3_600_000_000), + (Microsecond, "day") => NonZeroI64::new(86_400_000_000), + + (Nanosecond, "microsecond") => NonZeroI64::new(1_000), + (Nanosecond, "millisecond") => NonZeroI64::new(1_000_000), + (Nanosecond, "second") => NonZeroI64::new(1_000_000_000), + (Nanosecond, "minute") => NonZeroI64::new(60_000_000_000), + (Nanosecond, "hour") => NonZeroI64::new(3_600_000_000_000), + (Nanosecond, "day") => 
NonZeroI64::new(86_400_000_000_000), _ => None, }; if let Some(unit) = unit { - let original_type = array.data_type(); - let array = arrow::compute::cast(array, &DataType::Int64)?; - let array = arrow::compute::kernels::numeric::div(&array, &unit)?; - let array = arrow::compute::kernels::numeric::mul(&array, &unit)?; - let array = arrow::compute::cast(&array, original_type)?; - Ok(array) + let unit = unit.get(); + let array = PrimitiveArray::::from_iter_values_with_nulls( + array + .values() + .iter() + .map(|v| *v - i64::rem_euclid(*v, unit)), + array.nulls().cloned(), + ); + Ok(Arc::new(array)) } else { // truncate to the same or smaller unit Ok(Arc::new(array.clone())) diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 84dd7098a2ee..250d4e9830e5 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -1687,6 +1687,30 @@ SELECT DATE_TRUNC('second', '2022-08-03 14:38:50Z'); ---- 2022-08-03T14:38:50 +# DATE_TRUNC handling of times before the unix epoch (issue 18334) +query PPPPPPPPPPP +SELECT + d, + DATE_TRUNC('year', d), + DATE_TRUNC('quarter', d), + DATE_TRUNC('month', d), + DATE_TRUNC('week', d), + DATE_TRUNC('day', d), + DATE_TRUNC('hour', d), + DATE_TRUNC('minute', d), + DATE_TRUNC('second', d), + DATE_TRUNC('millisecond', d), + DATE_TRUNC('microsecond', d), +FROM (VALUES + (TIMESTAMP '1900-06-15 07:09:00'), + (TIMESTAMP '1970-01-01 00:00:00'), + (TIMESTAMP '2024-12-31 23:39:01.123456789') +) AS t(d); +---- +1900-06-15T07:09:00 1900-01-01T00:00:00 1900-04-01T00:00:00 1900-06-01T00:00:00 1900-06-11T00:00:00 1900-06-15T00:00:00 1900-06-15T07:00:00 1900-06-15T07:09:00 1900-06-15T07:09:00 1900-06-15T07:09:00 1900-06-15T07:09:00 +1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1969-12-29T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 1970-01-01T00:00:00 +2024-12-31T23:39:01.123456789 2024-01-01T00:00:00 2024-10-01T00:00:00 2024-12-01T00:00:00 2024-12-30T00:00:00 2024-12-31T00:00:00 2024-12-31T23:00:00 2024-12-31T23:39:00 2024-12-31T23:39:01 2024-12-31T23:39:01.123 2024-12-31T23:39:01.123456 + # Test that interval can add a timestamp query P SELECT timestamp '2013-07-01 12:00:00' + INTERVAL '8' DAY; From d36f8e7948ee54058c160f7e8b41b511ed2e8264 Mon Sep 17 00:00:00 2001 From: XL Liang Date: Fri, 31 Oct 2025 02:44:55 +0800 Subject: [PATCH 059/157] fix: Preserve percent-encoding in `PartitionedFile` paths during deserialization (#18346) ## Which issue does this PR close? - Closes #18345 ## Rationale for this change ## What changes are included in this PR? This PR changes the implementation to use Path::parse(proto.path). As per the object_store crate's documentation, Path::parse is the correct method for constructing a Path from a raw, already-encoded string, as it preserves the encoding. ## Are these changes tested? Yes, with unit tests. ## Are there any user-facing changes? No. 
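To illustrate the distinction, a minimal sketch of how the two constructors treat an already percent-encoded path, assuming the `object_store` `Path` API described above (the example path is made up):

```rust
use object_store::path::Path;

fn main() {
    // A location whose final segment legitimately contains an encoded '/'.
    let raw = "foo/foo%2Fbar";

    // `Path::parse` treats the input as already percent-encoded and preserves
    // it, so a serialized `PartitionedFile` path round-trips unchanged.
    let parsed = Path::parse(raw).expect("valid object store path");
    assert_eq!(parsed.as_ref(), raw);

    // `Path::from` assumes an unencoded input and re-encodes the '%' itself
    // (turning `%2F` into `%252F`), which is what broke the round trip.
    let re_encoded = Path::from(raw);
    assert_ne!(re_encoded.as_ref(), raw);
}
```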
--- .../proto/src/physical_plan/from_proto.rs | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 2a3906d49347..349ed79ddb4a 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -572,7 +572,9 @@ impl TryFrom<&protobuf::PartitionedFile> for PartitionedFile { fn try_from(val: &protobuf::PartitionedFile) -> Result { Ok(PartitionedFile { object_meta: ObjectMeta { - location: Path::from(val.path.as_str()), + location: Path::parse(val.path.as_str()).map_err(|e| { + proto_error(format!("Invalid object_store path: {e}")) + })?, last_modified: Utc.timestamp_nanos(val.last_modified_ns as i64), size: val.size, e_tag: None, @@ -694,3 +696,55 @@ impl TryFrom<&protobuf::FileSinkConfig> for FileSinkConfig { }) } } + +#[cfg(test)] +mod tests { + use super::*; + use chrono::{TimeZone, Utc}; + use datafusion_datasource::PartitionedFile; + use object_store::path::Path; + use object_store::ObjectMeta; + + #[test] + fn partitioned_file_path_roundtrip_percent_encoded() { + let path_str = "foo/foo%2Fbar/baz%252Fqux"; + let pf = PartitionedFile { + object_meta: ObjectMeta { + location: Path::parse(path_str).unwrap(), + last_modified: Utc.timestamp_nanos(1_000), + size: 42, + e_tag: None, + version: None, + }, + partition_values: vec![], + range: None, + statistics: None, + extensions: None, + metadata_size_hint: None, + }; + + let proto = protobuf::PartitionedFile::try_from(&pf).unwrap(); + assert_eq!(proto.path, path_str); + + let pf2 = PartitionedFile::try_from(&proto).unwrap(); + assert_eq!(pf2.object_meta.location.as_ref(), path_str); + assert_eq!(pf2.object_meta.location, pf.object_meta.location); + assert_eq!(pf2.object_meta.size, pf.object_meta.size); + assert_eq!(pf2.object_meta.last_modified, pf.object_meta.last_modified); + } + + #[test] + fn partitioned_file_from_proto_invalid_path() { + let proto = protobuf::PartitionedFile { + path: "foo//bar".to_string(), + size: 1, + last_modified_ns: 0, + partition_values: vec![], + range: None, + statistics: None, + }; + + let err = PartitionedFile::try_from(&proto).unwrap_err(); + assert!(err.to_string().contains("Invalid object_store path")); + } +} From d0d8c0ff731b23e51859c666c5076cbd532bf8bc Mon Sep 17 00:00:00 2001 From: Christopher Watford Date: Thu, 30 Oct 2025 14:45:20 -0400 Subject: [PATCH 060/157] fix: SortPreservingMerge sanity check rejects valid ORDER BY with CASE expression (#18342) ## Which issue does this PR close? - Closes #18327 ## Rationale for this change ORDER BY with a CASE statement didn't always work, raising a sanity check error in SortPreservingMergeExec. The plan showed that the partitions all had the same ordering, but for whatever reason they were not detected as being equal. Using a single partition succeeded always. ## What changes are included in this PR? The changes are non-obvious and I spent a lot of time bisecting/debug printing and landed on a failure in bounds checking with boolean interval arithmetic. Returning UNCERTAIN if either leg of the interval is NULL resolves the upstream issue where CASE statements end up being deemed Unordered. My rust-fu is hobbyist at best, so while this appears to resolve my issue I cannot for-certain exclaim that I've solved it all (Claude 4.5 agrees with my fix, but that's not an indication its any good). I'm also reasonably certain my unit tests are more ham fisted than necessary. 
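For reference, a minimal sketch of the behavior change, mirroring the new unit tests below (the `datafusion_expr_common` import path is assumed here for illustration):

```rust
use datafusion_common::{Result, ScalarValue};
use datafusion_expr_common::interval_arithmetic::Interval;

fn main() -> Result<()> {
    // A boolean interval whose bounds are both NULL, e.g. from a CASE branch
    // that may evaluate to NULL.
    let unknown =
        Interval::try_new(ScalarValue::Boolean(None), ScalarValue::Boolean(None))?;

    // These previously hit the "Incompatible data types for logical
    // conjunction/disjunction" internal error; they now fall back to
    // UNCERTAIN, or to the dominating certain value.
    assert_eq!(unknown.and(&Interval::CERTAINLY_FALSE)?, Interval::CERTAINLY_FALSE);
    assert_eq!(unknown.and(&Interval::CERTAINLY_TRUE)?, Interval::UNCERTAIN);
    assert_eq!(unknown.or(&Interval::CERTAINLY_TRUE)?, Interval::CERTAINLY_TRUE);
    assert_eq!(unknown.or(&Interval::CERTAINLY_FALSE)?, Interval::UNCERTAIN);
    Ok(())
}
```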
## Are these changes tested? 1. Yes, unit tests have been added. ## Are there any user-facing changes? This does not change any behavior beyond resolving a bug with a valid SQL statement. --------- Co-authored-by: Andrew Lamb --- .../expr-common/src/interval_arithmetic.rs | 66 ++++++++++++++++++- datafusion/sqllogictest/test_files/union.slt | 37 +++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index 40c44cfb3ca2..7515b59b9221 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -583,7 +583,9 @@ impl Interval { upper: ScalarValue::Boolean(Some(upper)), }) } - _ => internal_err!("Incompatible data types for logical conjunction"), + + // Return UNCERTAIN when intervals don't have concrete boolean bounds + _ => Ok(Self::UNCERTAIN), } } @@ -606,7 +608,9 @@ impl Interval { upper: ScalarValue::Boolean(Some(upper)), }) } - _ => internal_err!("Incompatible data types for logical disjunction"), + + // Return UNCERTAIN when intervals don't have concrete boolean bounds + _ => Ok(Self::UNCERTAIN), } } @@ -2517,6 +2521,64 @@ mod tests { Ok(()) } + #[test] + fn test_and_or_with_normalized_boolean_intervals() -> Result<()> { + // Verify that NULL boolean bounds are normalized and don't cause errors + let from_nulls = + Interval::try_new(ScalarValue::Boolean(None), ScalarValue::Boolean(None))?; + + assert!(from_nulls.or(&Interval::CERTAINLY_TRUE).is_ok()); + assert!(from_nulls.and(&Interval::CERTAINLY_FALSE).is_ok()); + + Ok(()) + } + + #[test] + fn test_and_null_boolean_intervals() -> Result<()> { + let null_interval = + Interval::try_new(ScalarValue::Boolean(None), ScalarValue::Boolean(None))?; + + let and_result = null_interval.and(&Interval::CERTAINLY_FALSE)?; + assert_eq!(and_result, Interval::CERTAINLY_FALSE); + + let and_result = Interval::CERTAINLY_FALSE.and(&null_interval)?; + assert_eq!(and_result, Interval::CERTAINLY_FALSE); + + let and_result = null_interval.and(&Interval::CERTAINLY_TRUE)?; + assert_eq!(and_result, Interval::UNCERTAIN); + + let and_result = Interval::CERTAINLY_TRUE.and(&null_interval)?; + assert_eq!(and_result, Interval::UNCERTAIN); + + let and_result = null_interval.and(&null_interval)?; + assert_eq!(and_result, Interval::UNCERTAIN); + + Ok(()) + } + + #[test] + fn test_or_null_boolean_intervals() -> Result<()> { + let null_interval = + Interval::try_new(ScalarValue::Boolean(None), ScalarValue::Boolean(None))?; + + let or_result = null_interval.or(&Interval::CERTAINLY_FALSE)?; + assert_eq!(or_result, Interval::UNCERTAIN); + + let or_result = Interval::CERTAINLY_FALSE.or(&null_interval)?; + assert_eq!(or_result, Interval::UNCERTAIN); + + let or_result = null_interval.or(&Interval::CERTAINLY_TRUE)?; + assert_eq!(or_result, Interval::CERTAINLY_TRUE); + + let or_result = Interval::CERTAINLY_TRUE.or(&null_interval)?; + assert_eq!(or_result, Interval::CERTAINLY_TRUE); + + let or_result = null_interval.or(&null_interval)?; + assert_eq!(or_result, Interval::UNCERTAIN); + + Ok(()) + } + #[test] fn intersect_test() -> Result<()> { let possible_cases = vec![ diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index 75db459b1881..0c8b8c6edb1f 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -953,3 +953,40 @@ drop table u1; statement count 0 drop table u2; + +# repro for 
https://github.com/apache/datafusion/issues/18327 +# should not error +query TITT + WITH typ(oid, typnamespace, typname, typtype) AS ( + SELECT * FROM (VALUES (1, 10, 't1', 'b')) + UNION ALL SELECT * FROM (VALUES (2, NULL, 't2', 'b')) + UNION ALL SELECT * FROM (VALUES (3, 12, 't3', NULL)) + ) + , ns(oid, nspname) AS (VALUES (1, 'ns1'), (2, 'ns2')) + SELECT ns.nspname, typ.oid, typ.typname, typ.typtype + FROM typ JOIN ns ON (ns.oid = typ.typnamespace) + WHERE typ.typtype IN ('b','r','m','e','d') + ORDER BY CASE WHEN typ.typtype IN ('b','e','p') THEN 0 + WHEN typ.typtype = 'r' THEN 1 + END +---- + +# Add another row with a non-NULL value `m` which is retained by the +# filter but not matching any WHEN branch m? +query TITT + WITH typ(oid, typnamespace, typname, typtype) AS ( + SELECT * FROM (VALUES (1, 10, 't1', 'b')) + UNION ALL SELECT * FROM (VALUES (2, NULL, 't2', 'b')) + UNION ALL SELECT * FROM (VALUES (3, 12, 't3', NULL)) + UNION ALL SELECT * FROM (VALUES (4, 40, 't3', 'm')) + ), ns(oid, nspname) AS ( + VALUES (1, 'ns1'), (2, 'ns2'), (40, 'ns3') + ) + SELECT ns.nspname, typ.oid, typ.typname, typ.typtype + FROM typ JOIN ns ON (ns.oid = typ.typnamespace) + WHERE typ.typtype IN ('b','r','m','e','d') + ORDER BY CASE WHEN typ.typtype IN ('b','e','p') THEN 0 + WHEN typ.typtype = 'r' THEN 1 + END +---- +ns3 4 t3 m From a87235f3348a00ba561c1a1a2630bc1e94942626 Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Fri, 31 Oct 2025 05:45:45 +1100 Subject: [PATCH 061/157] Refactor `range`/`gen_series` signature away from user defined (#18317) ## Which issue does this PR close? - Closes #15881 - See my notes below ## Rationale for this change Trying to move away from user defined signatures where possible; mainly to ensure consistency of error checking/messages. The original issue is because the function has to do this checking itself leading to inconsistency of error used (ideally shouldn't be internal). By uplifting away from a user defined signature we can make use of existing code meant to handle this checking and error messages for us. ## What changes are included in this PR? Defined range/generate_series signature via coercible API instead of being user defined. Some accompanying changes are needed in the signature code to make this possible. ## Are these changes tested? Added SLT tests and fixed any existing ones. ## Are there any user-facing changes? No (error messages do change though) --- datafusion/common/src/types/builtin.rs | 35 +++ datafusion/common/src/types/native.rs | 5 + datafusion/expr-common/src/signature.rs | 6 +- datafusion/functions-nested/src/range.rs | 217 +++++++++++-------- datafusion/sqllogictest/test_files/array.slt | 44 +++- 5 files changed, 210 insertions(+), 97 deletions(-) diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs index ec69db790377..314529b99a34 100644 --- a/datafusion/common/src/types/builtin.rs +++ b/datafusion/common/src/types/builtin.rs @@ -15,9 +15,17 @@ // specific language governing permissions and limitations // under the License. +use arrow::datatypes::IntervalUnit::*; + use crate::types::{LogicalTypeRef, NativeType}; use std::sync::{Arc, LazyLock}; +/// Create a singleton and accompanying static variable for a [`LogicalTypeRef`] +/// of a [`NativeType`]. +/// * `name`: name of the static variable, must be unique. +/// * `getter`: name of the public function that will return the singleton instance +/// of the static variable. +/// * `ty`: the [`NativeType`]. macro_rules! 
singleton { ($name:ident, $getter:ident, $ty:ident) => { static $name: LazyLock = @@ -31,6 +39,26 @@ macro_rules! singleton { }; } +/// Similar to [`singleton`], but for native types that have variants, such as +/// `NativeType::Interval(MonthDayNano)`. +/// * `name`: name of the static variable, must be unique. +/// * `getter`: name of the public function that will return the singleton instance +/// of the static variable. +/// * `ty`: the [`NativeType`]. +/// * `variant`: specific variant of the `ty`. +macro_rules! singleton_variant { + ($name:ident, $getter:ident, $ty:ident, $variant:ident) => { + static $name: LazyLock = + LazyLock::new(|| Arc::new(NativeType::$ty($variant))); + + #[doc = "Getter for singleton instance of a logical type representing"] + #[doc = concat!("[`NativeType::", stringify!($ty), "`] of unit [`", stringify!($variant),"`].`")] + pub fn $getter() -> LogicalTypeRef { + Arc::clone(&$name) + } + }; +} + singleton!(LOGICAL_NULL, logical_null, Null); singleton!(LOGICAL_BOOLEAN, logical_boolean, Boolean); singleton!(LOGICAL_INT8, logical_int8, Int8); @@ -47,3 +75,10 @@ singleton!(LOGICAL_FLOAT64, logical_float64, Float64); singleton!(LOGICAL_DATE, logical_date, Date); singleton!(LOGICAL_BINARY, logical_binary, Binary); singleton!(LOGICAL_STRING, logical_string, String); + +singleton_variant!( + LOGICAL_INTERVAL_MDN, + logical_interval_mdn, + Interval, + MonthDayNano +); diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 5cef0adfbde8..8c41701ae576 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -486,4 +486,9 @@ impl NativeType { pub fn is_binary(&self) -> bool { matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_)) } + + #[inline] + pub fn is_null(&self) -> bool { + matches!(self, NativeType::Null) + } } diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 5cb7a17ee312..2bf7092dd222 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -382,10 +382,7 @@ impl TypeSignatureClass { } /// Does the specified `NativeType` match this type signature class? - pub fn matches_native_type( - self: &TypeSignatureClass, - logical_type: &NativeType, - ) -> bool { + pub fn matches_native_type(&self, logical_type: &NativeType) -> bool { if logical_type == &NativeType::Null { return true; } @@ -431,6 +428,7 @@ impl TypeSignatureClass { TypeSignatureClass::Binary if native_type.is_binary() => { Ok(origin_type.to_owned()) } + _ if native_type.is_null() => Ok(origin_type.to_owned()), _ => internal_err!("May miss the matching logic in `matches_native_type`"), } } diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs index 01c6e9c43f2e..e570ecf97420 100644 --- a/datafusion/functions-nested/src/range.rs +++ b/datafusion/functions-nested/src/range.rs @@ -18,33 +18,39 @@ //! [`ScalarUDFImpl`] definitions for range and gen_series functions. 
use crate::utils::make_scalar_function; -use arrow::array::{ - builder::{Date32Builder, TimestampNanosecondBuilder}, - temporal_conversions::as_datetime_with_timezone, - timezone::Tz, - types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType}, - Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullBufferBuilder, -}; use arrow::buffer::OffsetBuffer; -use arrow::datatypes::{ - DataType, DataType::*, Field, IntervalUnit::MonthDayNano, TimeUnit::Nanosecond, +use arrow::datatypes::TimeUnit; +use arrow::datatypes::{DataType, Field, IntervalUnit::MonthDayNano}; +use arrow::{ + array::{ + builder::{Date32Builder, TimestampNanosecondBuilder}, + temporal_conversions::as_datetime_with_timezone, + timezone::Tz, + types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType}, + Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullBufferBuilder, + }, + compute::cast, }; +use datafusion_common::internal_err; use datafusion_common::{ cast::{ as_date32_array, as_int64_array, as_interval_mdn_array, as_timestamp_nanosecond_array, }, - DataFusionError, ScalarValue, + types::{ + logical_date, logical_int64, logical_interval_mdn, logical_string, NativeType, + }, + ScalarValue, }; use datafusion_common::{ exec_datafusion_err, exec_err, not_impl_datafusion_err, utils::take_function_args, Result, }; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, + TypeSignatureClass, Volatility, }; use datafusion_macros::user_doc; -use itertools::Itertools; use std::any::Any; use std::cmp::Ordering; use std::iter::from_fn; @@ -146,10 +152,52 @@ impl Default for Range { } impl Range { + fn defined_signature() -> Signature { + // We natively only support i64 in our implementation; so ensure we cast other integer + // types to it. + let integer = Coercion::new_implicit( + TypeSignatureClass::Native(logical_int64()), + vec![TypeSignatureClass::Integer], + NativeType::Int64, + ); + // We natively only support mdn in our implementation; so ensure we cast other interval + // types to it. + let interval = Coercion::new_implicit( + TypeSignatureClass::Native(logical_interval_mdn()), + vec![TypeSignatureClass::Interval], + NativeType::Interval(MonthDayNano), + ); + // Ideally we'd limit to only Date32 & Timestamp(Nanoseconds) as those are the implementations + // we have but that is difficult to do with this current API; we'll cast later on to + // handle such types. + let date = Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::Date, + ); + let timestamp = Coercion::new_exact(TypeSignatureClass::Timestamp); + Signature::one_of( + vec![ + // Integer ranges + // Stop + TypeSignature::Coercible(vec![integer.clone()]), + // Start & stop + TypeSignature::Coercible(vec![integer.clone(), integer.clone()]), + // Start, stop & step + TypeSignature::Coercible(vec![integer.clone(), integer.clone(), integer]), + // Date range + TypeSignature::Coercible(vec![date.clone(), date, interval.clone()]), + // Timestamp range + TypeSignature::Coercible(vec![timestamp.clone(), timestamp, interval]), + ], + Volatility::Immutable, + ) + } + /// Generate `range()` function which excludes upper bound. 
pub fn new() -> Self { Self { - signature: Signature::user_defined(Volatility::Immutable), + signature: Self::defined_signature(), include_upper_bound: false, } } @@ -157,7 +205,7 @@ impl Range { /// Generate `generate_series()` function which includes upper bound. fn generate_series() -> Self { Self { - signature: Signature::user_defined(Volatility::Immutable), + signature: Self::defined_signature(), include_upper_bound: true, } } @@ -180,39 +228,27 @@ impl ScalarUDFImpl for Range { &self.signature } - fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - arg_types - .iter() - .map(|arg_type| match arg_type { - Null => Ok(Null), - Int8 => Ok(Int64), - Int16 => Ok(Int64), - Int32 => Ok(Int64), - Int64 => Ok(Int64), - UInt8 => Ok(Int64), - UInt16 => Ok(Int64), - UInt32 => Ok(Int64), - UInt64 => Ok(Int64), - Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())), - Date32 => Ok(Date32), - Date64 => Ok(Date32), - Utf8 => Ok(Date32), - LargeUtf8 => Ok(Date32), - Utf8View => Ok(Date32), - Interval(_) => Ok(Interval(MonthDayNano)), - _ => exec_err!("Unsupported DataType"), - }) - .try_collect() - } - fn return_type(&self, arg_types: &[DataType]) -> Result { if arg_types.iter().any(|t| t.is_null()) { - Ok(Null) - } else { - Ok(List(Arc::new(Field::new_list_field( + return Ok(DataType::Null); + } + + match (&arg_types[0], arg_types.get(1)) { + // In implementation we downcast to Date32 so ensure reflect that here + (_, Some(DataType::Date64)) | (DataType::Date64, _) => Ok(DataType::List( + Arc::new(Field::new_list_field(DataType::Date32, true)), + )), + // Ensure we preserve timezone + (DataType::Timestamp(_, tz), _) => { + Ok(DataType::List(Arc::new(Field::new_list_field( + DataType::Timestamp(TimeUnit::Nanosecond, tz.to_owned()), + true, + )))) + } + _ => Ok(DataType::List(Arc::new(Field::new_list_field( arg_types[0].clone(), true, - )))) + )))), } } @@ -226,13 +262,20 @@ impl ScalarUDFImpl for Range { return Ok(ColumnarValue::Scalar(ScalarValue::Null)); } match args[0].data_type() { - Int64 => make_scalar_function(|args| self.gen_range_inner(args))(args), - Date32 => make_scalar_function(|args| self.gen_range_date(args))(args), - Timestamp(_, _) => { + DataType::Int64 => { + make_scalar_function(|args| self.gen_range_inner(args))(args) + } + DataType::Date32 | DataType::Date64 => { + make_scalar_function(|args| self.gen_range_date(args))(args) + } + DataType::Timestamp(_, _) => { make_scalar_function(|args| self.gen_range_timestamp(args))(args) } dt => { - exec_err!("unsupported type for {}. 
Expected Int64, Date32 or Timestamp, got: {dt}", self.name()) + internal_err!( + "Signature failed to guard unknown input type for {}: {dt}", + self.name() + ) } } } @@ -274,7 +317,7 @@ impl Range { as_int64_array(stop_array)?, Some(as_int64_array(step_array)?), ), - _ => return exec_err!("{} expects 1 to 3 arguments", self.name()), + _ => return internal_err!("{} expects 1 to 3 arguments", self.name()), }; let mut values = vec![]; @@ -310,7 +353,7 @@ impl Range { }; } let arr = Arc::new(ListArray::try_new( - Arc::new(Field::new_list_field(Int64, true)), + Arc::new(Field::new_list_field(DataType::Int64, true)), OffsetBuffer::new(offsets.into()), Arc::new(Int64Array::from(values)), valid.finish(), @@ -320,29 +363,28 @@ impl Range { fn gen_range_date(&self, args: &[ArrayRef]) -> Result { let [start, stop, step] = take_function_args(self.name(), args)?; + let step = as_interval_mdn_array(step)?; - let (start_array, stop_array, step_array) = ( - as_date32_array(start)?, - as_date32_array(stop)?, - as_interval_mdn_array(step)?, - ); + // Signature can only guarantee we get a date type, not specifically + // date32 so handle potential cast from date64 here. + let start = cast(start, &DataType::Date32)?; + let start = as_date32_array(&start)?; + let stop = cast(stop, &DataType::Date32)?; + let stop = as_date32_array(&stop)?; // values are date32s let values_builder = Date32Builder::new(); let mut list_builder = ListBuilder::new(values_builder); - for idx in 0..stop_array.len() { - if start_array.is_null(idx) - || stop_array.is_null(idx) - || step_array.is_null(idx) - { + for idx in 0..stop.len() { + if start.is_null(idx) || stop.is_null(idx) || step.is_null(idx) { list_builder.append_null(); continue; } - let start = start_array.value(idx); - let stop = stop_array.value(idx); - let step = step_array.value(idx); + let start = start.value(idx); + let stop = stop.value(idx); + let step = step.value(idx); let (months, days, _) = IntervalMonthDayNanoType::to_parts(step); if months == 0 && days == 0 { @@ -378,44 +420,45 @@ impl Range { fn gen_range_timestamp(&self, args: &[ArrayRef]) -> Result { let [start, stop, step] = take_function_args(self.name(), args)?; + let step = as_interval_mdn_array(step)?; + + // Signature can only guarantee we get a timestamp type, not specifically + // timestamp(ns) so handle potential cast from other timestamps here. 
+ fn cast_to_ns(arr: &ArrayRef) -> Result { + match arr.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, _) => Ok(Arc::clone(arr)), + DataType::Timestamp(_, tz) => Ok(cast( + arr, + &DataType::Timestamp(TimeUnit::Nanosecond, tz.to_owned()), + )?), + _ => unreachable!(), + } + } + let start = cast_to_ns(start)?; + let start = as_timestamp_nanosecond_array(&start)?; + let stop = cast_to_ns(stop)?; + let stop = as_timestamp_nanosecond_array(&stop)?; - // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz) - // TODO: remove these map_err once the signature is robust enough to guard against this - let start_arr = as_timestamp_nanosecond_array(start).map_err(|_e| { - DataFusionError::Internal(format!( - "Unexpected argument type for {} : {}", - self.name(), - start.data_type() - )) - })?; - let stop_arr = as_timestamp_nanosecond_array(stop).map_err(|_e| { - DataFusionError::Internal(format!( - "Unexpected argument type for {} : {}", - self.name(), - stop.data_type() - )) - })?; - let step_arr = as_interval_mdn_array(step)?; - let start_tz = parse_tz(&start_arr.timezone())?; - let stop_tz = parse_tz(&stop_arr.timezone())?; + let start_tz = parse_tz(&start.timezone())?; + let stop_tz = parse_tz(&stop.timezone())?; // values are timestamps - let values_builder = start_arr + let values_builder = start .timezone() .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| { TimestampNanosecondBuilder::new().with_timezone(start_tz_str) }); let mut list_builder = ListBuilder::new(values_builder); - for idx in 0..start_arr.len() { - if start_arr.is_null(idx) || stop_arr.is_null(idx) || step_arr.is_null(idx) { + for idx in 0..start.len() { + if start.is_null(idx) || stop.is_null(idx) || step.is_null(idx) { list_builder.append_null(); continue; } - let start = start_arr.value(idx); - let stop = stop_arr.value(idx); - let step = step_arr.value(idx); + let start = start.value(idx); + let stop = stop.value(idx); + let step = step.value(idx); let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step); if months == 0 && days == 0 && ns == 0 { diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 144e3b757adf..5c74f3ddc613 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6949,6 +6949,23 @@ select range(5), ---- [0, 1, 2, 3, 4] [2, 3, 4] [2, 5, 8] [10, 7, 4] [] [] [1, 0, -1, -2, -3, -4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02] [1989-04-01, 1990-04-01, 1991-04-01] [] +# Ensure can coerce from other valid types +query ??????????? 
+select range(5), + range(2, 5), + range(2, 10, 3), + range(10, 2, -3), + range(arrow_cast(1, 'Int8'), 5, -1), + range(arrow_cast(1, 'Int16'), arrow_cast(-5, 'Int8'), 1), + range(arrow_cast(1, 'Int32'), arrow_cast(-5, 'Int16'), arrow_cast(-1, 'Int8')), + range(DATE '1992-09-01', DATE '1993-03-01', arrow_cast('1 MONTH', 'Interval(YearMonth)')), + range(DATE '1993-02-01', arrow_cast(DATE '1993-01-01', 'Date64'), INTERVAL '-1' DAY), + range(arrow_cast(DATE '1989-04-01', 'Date64'), DATE '1993-03-01', INTERVAL '1' YEAR), + range(arrow_cast(DATE '1993-03-01', 'Date64'), arrow_cast(DATE '1989-04-01', 'Date64'), INTERVAL '1' YEAR) +; +---- +[0, 1, 2, 3, 4] [2, 3, 4] [2, 5, 8] [10, 7, 4] [] [] [1, 0, -1, -2, -3, -4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02] [1989-04-01, 1990-04-01, 1991-04-01] [] + # Test range with zero step query error DataFusion error: Execution error: step can't be 0 for function range\(start \[, stop, step\]\) select range(1, 1, 0); @@ -7114,6 +7131,17 @@ select generate_series('2021-01-01'::timestamp, '2021-01-01T15:00:00'::timestamp ---- [2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00] +# Other timestamp types are coerced to nanosecond +query ? +select generate_series(arrow_cast('2021-01-01'::timestamp, 'Timestamp(Second, None)'), '2021-01-01T15:00:00'::timestamp, INTERVAL '1' HOUR); +---- +[2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00] + +query ? +select generate_series('2021-01-01'::timestamp, arrow_cast('2021-01-01T15:00:00'::timestamp, 'Timestamp(Microsecond, None)'), INTERVAL '1' HOUR); +---- +[2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00] + query ? 
select generate_series('2021-01-01T00:00:00EST'::timestamp, '2021-01-01T15:00:00-12:00'::timestamp, INTERVAL '1' HOUR); ---- @@ -7131,9 +7159,18 @@ select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, [2021-01-01T00:00:00-05:00, 2021-01-01T01:29:54.500-05:00, 2021-01-01T02:59:49-05:00, 2021-01-01T04:29:43.500-05:00, 2021-01-01T05:59:38-05:00] ## mixing types for timestamps is not supported -query error DataFusion error: Internal error: Unexpected argument type for generate_series : Date32 +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, Some("-05:00"))'), DATE '2021-01-02', INTERVAL '1' HOUR); +## mixing types not allowed even if an argument is null +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature +select generate_series(TIMESTAMP '1992-09-01', DATE '1993-03-01', NULL); + +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature +select generate_series(1, '2024-01-01', '2025-01-02'); + +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature +select generate_series('2024-01-01'::timestamp, '2025-01-02', interval '1 day'); ## should return NULL query ? @@ -7152,11 +7189,6 @@ select generate_series(DATE '1992-09-01', DATE '1993-03-01', NULL); ---- NULL -query ? -select generate_series(TIMESTAMP '1992-09-01', DATE '1993-03-01', NULL); ----- -NULL - query ? select generate_series(NULL, DATE '1993-03-01', INTERVAL '1' YEAR); ---- From 2ee13d660ddebde7c4f6ac80dc5a4ba982cca985 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Fri, 31 Oct 2025 03:45:59 +0900 Subject: [PATCH 062/157] fix: `DataFrame::select_columns` and `DataFrame::drop_columns` for qualified duplicated field names (#18236) ## Which issue does this PR close? - Closes #18212. ## Rationale for this change `DataFrame::drop_columns` only considers one field for each `name`, it fails to drop columns from dataframe containing duplicated names from different relations. Such as `mark` columns created by multiples `Join::LeftMark`. `DataFrame::select_columns` has the same issue, it fails to select columns with the same name from different relations. ## What changes are included in this PR? Allow `DataFrame::drop_columns` and `DataFrame::select_columns` work with duplicated names from different relations. ## Are these changes tested? Yes. ## Are there any user-facing changes? No. 
--------- Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe/mod.rs | 13 ++-- datafusion/core/tests/dataframe/mod.rs | 97 ++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 7 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index b164b050da80..965181b27ca4 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -310,12 +310,12 @@ impl DataFrame { pub fn select_columns(self, columns: &[&str]) -> Result { let fields = columns .iter() - .map(|name| { + .flat_map(|name| { self.plan .schema() - .qualified_field_with_unqualified_name(name) + .qualified_fields_with_unqualified_name(name) }) - .collect::>>()?; + .collect::>(); let expr: Vec = fields .into_iter() .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field)))) @@ -439,13 +439,12 @@ impl DataFrame { pub fn drop_columns(self, columns: &[&str]) -> Result { let fields_to_drop = columns .iter() - .map(|name| { + .flat_map(|name| { self.plan .schema() - .qualified_field_with_unqualified_name(name) + .qualified_fields_with_unqualified_name(name) }) - .filter(|r| r.is_ok()) - .collect::>>()?; + .collect::>(); let expr: Vec = self .plan .schema() diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index c35e3b2eb31b..2aac1768ac63 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -404,6 +404,55 @@ async fn select_with_periods() -> Result<()> { Ok(()) } +#[tokio::test] +async fn select_columns_duplicated_names_from_different_qualifiers() -> Result<()> { + let t1 = test_table_with_name("t1") + .await? + .select_columns(&["c1"])? + .limit(0, Some(3))?; + let t2 = test_table_with_name("t2") + .await? + .select_columns(&["c1"])? + .limit(3, Some(3))?; + let t3 = test_table_with_name("t3") + .await? + .select_columns(&["c1"])? + .limit(6, Some(3))?; + + let join_res = t1 + .join(t2, JoinType::Left, &["t1.c1"], &["t2.c1"], None)? + .join(t3, JoinType::Left, &["t1.c1"], &["t3.c1"], None)?; + assert_snapshot!( + batches_to_sort_string(&join_res.clone().collect().await.unwrap()), + @r" + +----+----+----+ + | c1 | c1 | c1 | + +----+----+----+ + | b | b | | + | b | b | | + | c | | | + | d | | d | + +----+----+----+ + " + ); + + let select_res = join_res.select_columns(&["c1"])?; + assert_snapshot!( + batches_to_sort_string(&select_res.clone().collect().await.unwrap()), + @r" + +----+----+----+ + | c1 | c1 | c1 | + +----+----+----+ + | b | b | | + | b | b | | + | c | | | + | d | | d | + +----+----+----+ + " + ); + Ok(()) +} + #[tokio::test] async fn drop_columns() -> Result<()> { // build plan using Table API @@ -542,6 +591,54 @@ async fn drop_with_periods() -> Result<()> { Ok(()) } +#[tokio::test] +async fn drop_columns_duplicated_names_from_different_qualifiers() -> Result<()> { + let t1 = test_table_with_name("t1") + .await? + .select_columns(&["c1"])? + .limit(0, Some(3))?; + let t2 = test_table_with_name("t2") + .await? + .select_columns(&["c1"])? + .limit(3, Some(3))?; + let t3 = test_table_with_name("t3") + .await? + .select_columns(&["c1"])? + .limit(6, Some(3))?; + + let join_res = t1 + .join(t2, JoinType::LeftMark, &["c1"], &["c1"], None)? 
+ .join(t3, JoinType::LeftMark, &["c1"], &["c1"], None)?; + assert_snapshot!( + batches_to_sort_string(&join_res.clone().collect().await.unwrap()), + @r" + +----+-------+-------+ + | c1 | mark | mark | + +----+-------+-------+ + | b | true | false | + | c | false | false | + | d | false | true | + +----+-------+-------+ + " + ); + + let drop_res = join_res.drop_columns(&["mark"])?; + assert_snapshot!( + batches_to_sort_string(&drop_res.clone().collect().await.unwrap()), + @r" + +----+ + | c1 | + +----+ + | b | + | c | + | d | + +----+ + " + ); + + Ok(()) +} + #[tokio::test] async fn aggregate() -> Result<()> { // build plan using DataFrame API From 3239868903fb09a6b856fbd2f36a447240745425 Mon Sep 17 00:00:00 2001 From: Blake Orth Date: Thu, 30 Oct 2025 12:51:21 -0600 Subject: [PATCH 063/157] Adds Partitioned CSV test to object store access tests (#18370) ## Which issue does this PR close? N/A -- This PR is a supporting effort to: - https://github.com/apache/datafusion/pull/18146 - https://github.com/apache/datafusion/issues/17211 ## Rationale for this change Adding these tests not only improves test coverage/expected output validation, but also gives us a common way to test and talk about object store access for specific query scenarios. ## What changes are included in this PR? - Adds a new test to the object store access integration tests that selects all rows from a set of CSV files under a hive partitioned directory structure - Adds new test harness method to build a partitioned ListingTable backed by CSV data - Adds a new helper method to build a partitioned csv data and register the table ## Are these changes tested? The changes are tests! ## Are there any user-facing changes? No cc @alamb --- .../tests/datasource/object_store_access.rs | 211 +++++++++++++++++- 1 file changed, 208 insertions(+), 3 deletions(-) diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index d1592c21472d..f89ca9e04914 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -28,6 +28,9 @@ use arrow::array::{ArrayRef, Int32Array, RecordBatch}; use async_trait::async_trait; use bytes::Bytes; use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext}; +use datafusion_catalog_listing::{ListingOptions, ListingTable, ListingTableConfig}; +use datafusion_datasource::ListingTableUrl; +use datafusion_datasource_csv::CsvFormat; use futures::stream::BoxStream; use insta::assert_snapshot; use object_store::memory::InMemory; @@ -123,6 +126,163 @@ async fn query_multi_csv_file() { ); } +#[tokio::test] +async fn query_partitioned_csv_file() { + let test = Test::new().with_partitioned_csv().await; + assert_snapshot!( + test.query("select * from csv_table_partitioned").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00001 | 1e-12 | true | 1 | 10 | 100 | + | 0.00003 | 5e-12 | false | 1 | 10 | 100 | + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + | 0.00003 | 3e-12 | true | 3 | 30 | 300 | + | 0.00003 | 5e-12 | false | 3 | 30 | 300 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 13 + - LIST (with delimiter) prefix=data + - LIST (with delimiter) prefix=data/a=1 + - LIST (with delimiter) 
prefix=data/a=2 + - LIST (with delimiter) prefix=data/a=3 + - LIST (with delimiter) prefix=data/a=1/b=10 + - LIST (with delimiter) prefix=data/a=2/b=20 + - LIST (with delimiter) prefix=data/a=3/b=30 + - LIST (with delimiter) prefix=data/a=1/b=10/c=100 + - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + - GET (opts) path=data/a=1/b=10/c=100/file_1.csv + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + - GET (opts) path=data/a=3/b=30/c=300/file_3.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE a=2").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 4 + - LIST (with delimiter) prefix=data/a=2 + - LIST (with delimiter) prefix=data/a=2/b=20 + - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE b=20").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 11 + - LIST (with delimiter) prefix=data + - LIST (with delimiter) prefix=data/a=1 + - LIST (with delimiter) prefix=data/a=2 + - LIST (with delimiter) prefix=data/a=3 + - LIST (with delimiter) prefix=data/a=1/b=10 + - LIST (with delimiter) prefix=data/a=2/b=20 + - LIST (with delimiter) prefix=data/a=3/b=30 + - LIST (with delimiter) prefix=data/a=1/b=10/c=100 + - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE c=200").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 | + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 11 + - LIST (with delimiter) prefix=data + - LIST (with delimiter) prefix=data/a=1 + - LIST (with delimiter) prefix=data/a=2 + - LIST (with delimiter) prefix=data/a=3 + - LIST (with delimiter) prefix=data/a=1/b=10 + - LIST (with delimiter) prefix=data/a=2/b=20 + - LIST (with delimiter) prefix=data/a=3/b=30 + - LIST (with delimiter) prefix=data/a=1/b=10/c=100 + - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE a=2 AND b=20").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00002 | 2e-12 | true | 2 | 20 | 200 
| + | 0.00003 | 5e-12 | false | 2 | 20 | 200 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - LIST (with delimiter) prefix=data/a=2/b=20 + - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + - GET (opts) path=data/a=2/b=20/c=200/file_2.csv + " + ); + + assert_snapshot!( + test.query("select * from csv_table_partitioned WHERE a<2 AND b=10 AND c=100").await, + @r" + ------- Query Output (2 rows) ------- + +---------+-------+-------+---+----+-----+ + | d1 | d2 | d3 | a | b | c | + +---------+-------+-------+---+----+-----+ + | 0.00001 | 1e-12 | true | 1 | 10 | 100 | + | 0.00003 | 5e-12 | false | 1 | 10 | 100 | + +---------+-------+-------+---+----+-----+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 11 + - LIST (with delimiter) prefix=data + - LIST (with delimiter) prefix=data/a=1 + - LIST (with delimiter) prefix=data/a=2 + - LIST (with delimiter) prefix=data/a=3 + - LIST (with delimiter) prefix=data/a=1/b=10 + - LIST (with delimiter) prefix=data/a=2/b=20 + - LIST (with delimiter) prefix=data/a=3/b=30 + - LIST (with delimiter) prefix=data/a=1/b=10/c=100 + - LIST (with delimiter) prefix=data/a=2/b=20/c=200 + - LIST (with delimiter) prefix=data/a=3/b=30/c=300 + - GET (opts) path=data/a=1/b=10/c=100/file_1.csv + " + ); +} + #[tokio::test] async fn create_single_parquet_file_default() { // The default metadata size hint is 512KB @@ -363,7 +523,7 @@ impl Test { self } - /// Register a CSV file at the given path relative to the [`datafusion_test_data`] directory + /// Register a CSV file at the given path async fn register_csv(self, table_name: &str, path: &str) -> Self { let mut options = CsvReadOptions::new(); options.has_header = true; @@ -375,8 +535,30 @@ impl Test { self } - /// Register a Parquet file at the given path relative to the - /// [`datafusion_test_data`] directory + /// Register a partitioned CSV table at the given path + async fn register_partitioned_csv(self, table_name: &str, path: &str) -> Self { + let file_format = Arc::new(CsvFormat::default().with_has_header(true)); + let options = ListingOptions::new(file_format); + + let url = format!("mem://{path}").parse().unwrap(); + let table_url = ListingTableUrl::try_new(url, None).unwrap(); + + let session_state = self.session_context.state(); + let mut config = ListingTableConfig::new(table_url).with_listing_options(options); + config = config + .infer_partitions_from_path(&session_state) + .await + .unwrap(); + config = config.infer_schema(&session_state).await.unwrap(); + + let table = Arc::new(ListingTable::try_new(config).unwrap()); + self.session_context + .register_table(table_name, table) + .unwrap(); + self + } + + /// Register a Parquet file at the given path async fn register_parquet(self, table_name: &str, path: &str) -> Self { let path = format!("mem://{path}"); let mut options: ParquetReadOptions<'_> = ParquetReadOptions::new(); @@ -425,6 +607,29 @@ impl Test { self.register_csv("csv_table", "/data/").await } + /// Register three CSV files in a partitioned directory structure, called + /// `csv_table_partitioned` + async fn with_partitioned_csv(mut self) -> Test { + for i in 1..4 { + // upload CSV data to object store + let csv_data1 = format!( + r#"d1,d2,d3 +0.0000{i},{i}e-12,true +0.00003,5e-12,false +"# + ); + self = self + .with_bytes( + &format!("/data/a={i}/b={}/c={}/file_{i}.csv", i * 10, i * 100,), + csv_data1, + ) + .await; + } + // register table + 
self.register_partitioned_csv("csv_table_partitioned", "/data/") + .await + } + /// Add a single parquet file that has two columns and two row groups named `parquet_table` /// /// Column "a": Int32 with values 0-100] in row group 1 From a0f1d1df7979a6b0c522e4ed7d5c3783faa3b6e0 Mon Sep 17 00:00:00 2001 From: Nga Tran Date: Thu, 30 Oct 2025 15:01:39 -0400 Subject: [PATCH 064/157] Add reproducer for consecutive RepartitionExec (#18343) Reproducer for https://github.com/apache/datafusion/issues/18341 --- .../test_files/aggregate_repartition.slt | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/aggregate_repartition.slt diff --git a/datafusion/sqllogictest/test_files/aggregate_repartition.slt b/datafusion/sqllogictest/test_files/aggregate_repartition.slt new file mode 100644 index 000000000000..27602b61e424 --- /dev/null +++ b/datafusion/sqllogictest/test_files/aggregate_repartition.slt @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Reproducer for https://github.com/apache/datafusion/issues/18341 +# Tests for aggregate repartition behavior +# Comparing CSV vs Parquet execution plans for GROUP BY queries + +# Create CSV version of the dimension data +query I +COPY ( + SELECT * FROM (VALUES + ('prod', 100, 'A'), + ('dev', 200, 'B'), + ('test', 150, 'A'), + ('prod', 300, 'C'), + ('dev', 250, 'B') + ) AS t(env, value, category) +) +TO 'test_files/scratch/aggregate_repartition/dim.csv' +STORED AS CSV +OPTIONS ('format.has_header' 'true'); +---- +5 + +# Create Parquet version of the dimension data +query I +COPY ( + SELECT * FROM (VALUES + ('prod', 100, 'A'), + ('dev', 200, 'B'), + ('test', 150, 'A'), + ('prod', 300, 'C'), + ('dev', 250, 'B') + ) AS t(env, value, category) +) +TO 'test_files/scratch/aggregate_repartition/dim.parquet' +STORED AS PARQUET; +---- +5 + +# Create external table for CSV +statement ok +CREATE EXTERNAL TABLE dim_csv +STORED AS CSV +LOCATION 'test_files/scratch/aggregate_repartition/dim.csv' +OPTIONS ('format.has_header' 'true'); + +# Create external table for Parquet +statement ok +CREATE EXTERNAL TABLE dim_parquet +STORED AS PARQUET +LOCATION 'test_files/scratch/aggregate_repartition/dim.parquet'; + +# Test 1: EXPLAIN query for CSV table with GROUP BY +# This plans looks reasonable +query TT +EXPLAIN SELECT env, count(*) FROM dim_csv GROUP BY env; +---- +logical_plan +01)Projection: dim_csv.env, count(Int64(1)) AS count(*) +02)--Aggregate: groupBy=[[dim_csv.env]], aggr=[[count(Int64(1))]] +03)----TableScan: dim_csv projection=[env] +physical_plan +01)ProjectionExec: expr=[env@0 as env, count(Int64(1))@1 as count(*)] +02)--AggregateExec: mode=FinalPartitioned, gby=[env@0 as env], aggr=[count(Int64(1))] +03)----CoalesceBatchesExec: target_batch_size=8192 
+04)------RepartitionExec: partitioning=Hash([env@0], 4), input_partitions=4 +05)--------AggregateExec: mode=Partial, gby=[env@0 as env], aggr=[count(Int64(1))] +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/aggregate_repartition/dim.csv]]}, projection=[env], file_type=csv, has_header=true + +# Test 2: EXPLAIN query for Parquet table with GROUP BY +# This plan differs from the one above and includes two consecutive repartitions — one round-robin and one hash — +# which seems unnecessary. We may want to align it with the previous plan (push the round robin down or remove the round robin), or, if the input file is small, +# avoid repartitioning altogether. A single partition should suffice for a single-step aggregate as the plan after this. + +query TT +EXPLAIN SELECT env, count(*) FROM dim_parquet GROUP BY env; +---- +logical_plan +01)Projection: dim_parquet.env, count(Int64(1)) AS count(*) +02)--Aggregate: groupBy=[[dim_parquet.env]], aggr=[[count(Int64(1))]] +03)----TableScan: dim_parquet projection=[env] +physical_plan +01)ProjectionExec: expr=[env@0 as env, count(Int64(1))@1 as count(*)] +02)--AggregateExec: mode=FinalPartitioned, gby=[env@0 as env], aggr=[count(Int64(1))] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------RepartitionExec: partitioning=Hash([env@0], 4), input_partitions=4 +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +06)----------AggregateExec: mode=Partial, gby=[env@0 as env], aggr=[count(Int64(1))] +07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/aggregate_repartition/dim.parquet]]}, projection=[env], file_type=parquet + +# Verify the queries actually work and return the same results +query TI rowsort +SELECT env, count(*) FROM dim_csv GROUP BY env; +---- +dev 2 +prod 2 +test 1 + +query TI rowsort +SELECT env, count(*) FROM dim_parquet GROUP BY env; +---- +dev 2 +prod 2 +test 1 + +# Test 3: Change target partitions to 1 to have single-aggregate plan +statement ok +SET datafusion.execution.target_partitions = 1; + +query TT +EXPLAIN SELECT env, count(*) FROM dim_parquet GROUP BY env; +---- +logical_plan +01)Projection: dim_parquet.env, count(Int64(1)) AS count(*) +02)--Aggregate: groupBy=[[dim_parquet.env]], aggr=[[count(Int64(1))]] +03)----TableScan: dim_parquet projection=[env] +physical_plan +01)ProjectionExec: expr=[env@0 as env, count(Int64(1))@1 as count(*)] +02)--AggregateExec: mode=Single, gby=[env@0 as env], aggr=[count(Int64(1))] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/aggregate_repartition/dim.parquet]]}, projection=[env], file_type=parquet From 868078efa4a769f0b11c72b79e7f98fc25f459cc Mon Sep 17 00:00:00 2001 From: Dhanush Date: Fri, 31 Oct 2025 07:44:57 +0530 Subject: [PATCH 065/157] feat(docs): enable navbar (#18324) ## Which issue does this PR close? - Closes #18284. ## What changes are included in this PR? I've enabled the navbar, which is required to use dark-light mode toggle and made following changes in the ui - Removed the existing logo in the side-bar (as it was redundant) - Removed search bar in the side-bar (as it was conflicting with navbar's search widget) image
image --- docs/source/_static/theme_overrides.css | 37 +++++++++++++++++------- docs/source/_templates/docs-sidebar.html | 11 ------- docs/source/_templates/layout.html | 4 --- docs/source/conf.py | 6 ++++ 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index 01f1a126a76a..9f288a2702e2 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -29,7 +29,6 @@ --pst-color-h2: var(--color-text-base); /* Use softer blue from bootstrap's default info color */ --pst-color-info: 23, 162, 184; - --pst-header-height: 0px; } code { @@ -40,16 +39,34 @@ code { text-align: center; } +/* Limit both light and dark mode logos in the navbar */ +.logo__image { + height: 32px; + width: auto; + max-height: 2.5rem; +} + /* Display appropriate logo for dark and light mode */ -.light-logo { display: inline; } -.dark-logo { display: none; } - -@media (prefers-color-scheme: dark) { - .light-logo { display: none; } - .dark-logo { - display: inline; - background-color: transparent !important; - } +.light-logo { + display: inline; +} + +.dark-logo { + display: none; +} + +html[data-theme="dark"] .light-logo { + display: none; +} + +html[data-theme="dark"] .dark-logo { + display: inline; + background-color: transparent !important; +} + +/* Align search bar & theme switch right */ +.navbar-header-items__end { + margin-left: auto; } /* Ensure the logo is properly displayed */ diff --git a/docs/source/_templates/docs-sidebar.html b/docs/source/_templates/docs-sidebar.html index 01aabb986050..fa3cd96b1360 100644 --- a/docs/source/_templates/docs-sidebar.html +++ b/docs/source/_templates/docs-sidebar.html @@ -1,14 +1,3 @@ -

[docs-sidebar.html diff body lost in extraction: per the diffstat and description above, the 11 removed lines were the sidebar's redundant logo markup and its search form]