From 5f7fa05f37bdf1b2513a736e44b40868244aeb74 Mon Sep 17 00:00:00 2001 From: Harrison Crosse Date: Thu, 16 Apr 2026 18:22:51 -0400 Subject: [PATCH 1/3] fix: rewrite concat(array, ...) to array_concat --- .../core/src/execution/session_state.rs | 5 + .../src/execution/session_state_defaults.rs | 11 +++ .../functions-nested/src/concat_rewrite.rs | 93 +++++++++++++++++++ datafusion/functions-nested/src/lib.rs | 5 + .../test_files/array/array_concat.slt | 73 +++++++++++++++ .../sqllogictest/test_files/explain.slt | 2 + 6 files changed, 189 insertions(+) create mode 100644 datafusion/functions-nested/src/concat_rewrite.rs diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index a5749e70ceaa..bea17b4b5944 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1156,6 +1156,11 @@ impl SessionStateBuilder { .get_or_insert_with(Vec::new) .extend(SessionStateDefaults::default_expr_planners()); + let analyzer = self.analyzer.get_or_insert_with(Analyzer::default); + for rewrite in SessionStateDefaults::default_function_rewrites() { + analyzer.add_function_rewrite(rewrite); + } + self.scalar_functions .get_or_insert_with(Vec::new) .extend(SessionStateDefaults::default_scalar_functions()); diff --git a/datafusion/core/src/execution/session_state_defaults.rs b/datafusion/core/src/execution/session_state_defaults.rs index 8ef041e5bf64..e27441bfc5a0 100644 --- a/datafusion/core/src/execution/session_state_defaults.rs +++ b/datafusion/core/src/execution/session_state_defaults.rs @@ -35,6 +35,7 @@ use datafusion_catalog::{MemoryCatalogProvider, MemorySchemaProvider}; use datafusion_execution::config::SessionConfig; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::runtime_env::RuntimeEnv; +use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_expr::planner::ExprPlanner; use datafusion_expr::registry::ExtensionTypeRegistrationRef; use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF}; @@ -102,6 +103,16 @@ impl SessionStateDefaults { expr_planners } + /// returns the list of default [`FunctionRewrite`]s installed on the analyzer. + pub fn default_function_rewrites() -> Vec> { + let rewrites: Vec> = vec![ + #[cfg(feature = "nested_expressions")] + Arc::new(functions_nested::concat_rewrite::ConcatArrayRewrite), + ]; + + rewrites + } + /// returns the list of default [`ScalarUDF`]s pub fn default_scalar_functions() -> Vec> { #[cfg_attr(not(feature = "nested_expressions"), expect(unused_mut))] diff --git a/datafusion/functions-nested/src/concat_rewrite.rs b/datafusion/functions-nested/src/concat_rewrite.rs new file mode 100644 index 000000000000..e54770b69aa8 --- /dev/null +++ b/datafusion/functions-nested/src/concat_rewrite.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ConcatArrayRewrite`] rewrites `concat(array, ...)` to `array_concat(array, ...)`. + +use std::any::Any; + +use arrow::datatypes::DataType; +use datafusion_common::config::ConfigOptions; +use datafusion_common::tree_node::Transformed; +use datafusion_common::{DFSchema, Result, plan_err}; +use datafusion_expr::Expr; +use datafusion_expr::ExprSchemable; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::expr_rewriter::FunctionRewrite; +use datafusion_functions::string::concat::ConcatFunc; + +use crate::concat::array_concat_udf; + +/// [`FunctionRewrite`] that turns `concat(array, ...)` into +/// `array_concat(array, ...)` at the analyzer phase. +/// +/// The string `concat` UDF only handles string and binary inputs. When it +/// receives array arguments it falls through to a debug-only code path and +/// produces surprising output. Rewriting at the analyzer phase means every +/// logical plan gets the corrected behavior. +/// +/// `concat` calls with only non-array arguments are left unchanged. +/// Mixed array and non-array arguments are rejected with a plan error. +#[derive(Debug, Default)] +pub struct ConcatArrayRewrite; + +impl FunctionRewrite for ConcatArrayRewrite { + fn name(&self) -> &str { + "concat_array_rewrite" + } + + fn rewrite( + &self, + expr: Expr, + schema: &DFSchema, + _config: &ConfigOptions, + ) -> Result> { + let Expr::ScalarFunction(ScalarFunction { func, args }) = &expr else { + return Ok(Transformed::no(expr)); + }; + if !(func.inner().as_ref() as &dyn Any).is::() { + return Ok(Transformed::no(expr)); + } + + let mut any_list = false; + let mut any_non_list = false; + for arg in args { + match arg.get_type(schema)? { + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) => any_list = true, + DataType::Null => {} + _ => any_non_list = true, + } + } + + if !any_list { + return Ok(Transformed::no(expr)); + } + if any_non_list { + return plan_err!( + "Cannot mix array and non-array arguments in concat function" + ); + } + + let Expr::ScalarFunction(ScalarFunction { args, .. }) = expr else { + unreachable!("already matched above") + }; + Ok(Transformed::yes(Expr::ScalarFunction( + ScalarFunction::new_udf(array_concat_udf(), args), + ))) + } +} diff --git a/datafusion/functions-nested/src/lib.rs b/datafusion/functions-nested/src/lib.rs index 99c33fb64039..73798798c044 100644 --- a/datafusion/functions-nested/src/lib.rs +++ b/datafusion/functions-nested/src/lib.rs @@ -42,6 +42,7 @@ pub mod array_has; pub mod arrays_zip; pub mod cardinality; pub mod concat; +pub mod concat_rewrite; pub mod dimension; pub mod distance; pub mod empty; @@ -192,6 +193,10 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { Ok(()) as Result<()> })?; + // Register the analyzer rewrite that turns `concat(array, ...)` into + // `array_concat(...)`. + registry.register_function_rewrite(Arc::new(concat_rewrite::ConcatArrayRewrite))?; + Ok(()) } diff --git a/datafusion/sqllogictest/test_files/array/array_concat.slt b/datafusion/sqllogictest/test_files/array/array_concat.slt index 0f847811615c..6137efd3385a 100644 --- a/datafusion/sqllogictest/test_files/array/array_concat.slt +++ b/datafusion/sqllogictest/test_files/array/array_concat.slt @@ -388,4 +388,77 @@ select array_concat(make_array(column3), column1, column2) from arrays_values_v2 [NULL] +## concat() delegates to array_concat when all arguments are arrays. +## An analyzer-phase FunctionRewrite rewrites the call before execution, so +## the string concat path never sees array inputs. + +query ? +select concat(make_array(1, 2, 3), make_array(4, 5)); +---- +[1, 2, 3, 4, 5] + +query ? +select concat(make_array(1, 2), make_array(3, 4), make_array(5, 6)); +---- +[1, 2, 3, 4, 5, 6] + +query ? +select concat(make_array(1, NULL, 3), make_array(4)); +---- +[1, NULL, 3, 4] + +query ? +select concat(make_array('a', 'b'), make_array('c', 'd')); +---- +[a, b, c, d] + +query ? +select concat(NULL::integer[], make_array(1, 2)); +---- +[1, 2] + +query ? +select concat(make_array(1, 2), NULL::integer[]); +---- +[1, 2] + +query ? +select concat(column1, column2) from arrays_values_v2; +---- +[NULL, 2, 3, 4, 5, NULL] +[7, NULL, 8] +[9, NULL, 10] +[NULL, 1, NULL, 21] +[11, 12] +NULL + +query ? +select concat( + arrow_cast(['1', '2'], 'LargeList(Utf8)'), + arrow_cast(['3'], 'LargeList(Utf8)') +); +---- +[1, 2, 3] + +query ? +select concat( + arrow_cast(['1', '2'], 'FixedSizeList(2, Utf8)'), + arrow_cast(['3'], 'FixedSizeList(1, Utf8)') +); +---- +[1, 2, 3] + +query ? +select concat(NULL::integer[], NULL::integer[]); +---- +NULL + +# Mixed array + non-array arguments are rejected at plan time. +statement error Cannot mix array and non-array arguments in concat function +select concat(make_array(1), 'x'); + +statement error Cannot mix array and non-array arguments in concat function +select concat('x', make_array(1)); + + include ./cleanup.slt.part diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 2e8a65385541..cd51e15c1c9f 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -173,6 +173,7 @@ EXPLAIN VERBOSE SELECT a, b, c FROM simple_explain_test initial_logical_plan 01)Projection: simple_explain_test.a, simple_explain_test.b, simple_explain_test.c 02)--TableScan: simple_explain_test +logical_plan after apply_function_rewrites SAME TEXT AS ABOVE logical_plan after resolve_grouping_function SAME TEXT AS ABOVE logical_plan after type_coercion SAME TEXT AS ABOVE analyzed_logical_plan SAME TEXT AS ABOVE @@ -548,6 +549,7 @@ EXPLAIN VERBOSE SELECT a, b, c FROM simple_explain_test initial_logical_plan 01)Projection: simple_explain_test.a, simple_explain_test.b, simple_explain_test.c 02)--TableScan: simple_explain_test +logical_plan after apply_function_rewrites SAME TEXT AS ABOVE logical_plan after resolve_grouping_function SAME TEXT AS ABOVE logical_plan after type_coercion SAME TEXT AS ABOVE analyzed_logical_plan SAME TEXT AS ABOVE From d4c6f67e9213159e06321a6be231e414882b3165 Mon Sep 17 00:00:00 2001 From: Harrison Crosse Date: Thu, 16 Apr 2026 22:32:16 -0400 Subject: [PATCH 2/3] chore: drop redundant rationale from ConcatArrayRewrite doc comment --- datafusion/functions-nested/src/concat_rewrite.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/datafusion/functions-nested/src/concat_rewrite.rs b/datafusion/functions-nested/src/concat_rewrite.rs index e54770b69aa8..a8531cb62974 100644 --- a/datafusion/functions-nested/src/concat_rewrite.rs +++ b/datafusion/functions-nested/src/concat_rewrite.rs @@ -34,11 +34,6 @@ use crate::concat::array_concat_udf; /// [`FunctionRewrite`] that turns `concat(array, ...)` into /// `array_concat(array, ...)` at the analyzer phase. /// -/// The string `concat` UDF only handles string and binary inputs. When it -/// receives array arguments it falls through to a debug-only code path and -/// produces surprising output. Rewriting at the analyzer phase means every -/// logical plan gets the corrected behavior. -/// /// `concat` calls with only non-array arguments are left unchanged. /// Mixed array and non-array arguments are rejected with a plan error. #[derive(Debug, Default)] From e8a4464686ee25de04abe5e2e7d5660499dcd9d5 Mon Sep 17 00:00:00 2001 From: Harrison Crosse Date: Fri, 17 Apr 2026 11:10:43 -0400 Subject: [PATCH 3/3] chore: address review feedback - use array_concat expr_fn instead of hand-constructing ScalarFunction - add FSL + List SLT case to cover mixed list-variant coercion --- datafusion/functions-nested/src/concat_rewrite.rs | 6 ++---- .../sqllogictest/test_files/array/array_concat.slt | 9 +++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-nested/src/concat_rewrite.rs b/datafusion/functions-nested/src/concat_rewrite.rs index a8531cb62974..be64d98a8773 100644 --- a/datafusion/functions-nested/src/concat_rewrite.rs +++ b/datafusion/functions-nested/src/concat_rewrite.rs @@ -29,7 +29,7 @@ use datafusion_expr::expr::ScalarFunction; use datafusion_expr::expr_rewriter::FunctionRewrite; use datafusion_functions::string::concat::ConcatFunc; -use crate::concat::array_concat_udf; +use crate::concat::array_concat; /// [`FunctionRewrite`] that turns `concat(array, ...)` into /// `array_concat(array, ...)` at the analyzer phase. @@ -81,8 +81,6 @@ impl FunctionRewrite for ConcatArrayRewrite { let Expr::ScalarFunction(ScalarFunction { args, .. }) = expr else { unreachable!("already matched above") }; - Ok(Transformed::yes(Expr::ScalarFunction( - ScalarFunction::new_udf(array_concat_udf(), args), - ))) + Ok(Transformed::yes(array_concat(args))) } } diff --git a/datafusion/sqllogictest/test_files/array/array_concat.slt b/datafusion/sqllogictest/test_files/array/array_concat.slt index 6137efd3385a..4b38c17ca924 100644 --- a/datafusion/sqllogictest/test_files/array/array_concat.slt +++ b/datafusion/sqllogictest/test_files/array/array_concat.slt @@ -453,6 +453,15 @@ select concat(NULL::integer[], NULL::integer[]); ---- NULL +# Mixed list variants are coerced by array_concat's own coerce_types +# rules (same result as calling array_concat directly). +query ?T +select + concat(arrow_cast([1, 2], 'FixedSizeList(2, Int64)'), make_array(3, 4)) as v, + arrow_typeof(concat(arrow_cast([1, 2], 'FixedSizeList(2, Int64)'), make_array(3, 4))) as t; +---- +[1, 2, 3, 4] List(Int64) + # Mixed array + non-array arguments are rejected at plan time. statement error Cannot mix array and non-array arguments in concat function select concat(make_array(1), 'x');