From 6d4f4b8b82eac4e8eced2d35326a734df6bcfdc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C4=81rti=C5=86=C5=A1=20Puri=C5=86=C5=A1?= Date: Sat, 25 Apr 2026 09:55:28 +0200 Subject: [PATCH] fix: make_array returns wrong row count for null-array input --- datafusion/core/tests/sql/select.rs | 32 +++++++++ datafusion/functions-nested/src/make_array.rs | 56 ++++++++++++---- .../spark/src/function/array/spark_array.rs | 65 +++++++++++++------ .../test_files/array/make_array.slt | 10 +++ .../test_files/spark/array/array.slt | 10 +++ 5 files changed, 140 insertions(+), 33 deletions(-) diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 96b911e8db13..2f03bfc451c8 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -174,6 +174,38 @@ async fn prepared_statement_type_coercion() -> Result<()> { Ok(()) } +#[tokio::test] +async fn make_array_null_typed_column_preserves_rows() -> Result<()> { + let ctx = SessionContext::new(); + let batch = RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), + ("n", Arc::new(NullArray::new(3)) as ArrayRef), + ])?; + ctx.register_batch("test", batch)?; + + let results = ctx + .sql( + "SELECT id, make_array(n) AS arr, array_length(make_array(n)) AS len \ + FROM test \ + ORDER BY id", + ) + .await? + .collect() + .await?; + + assert_snapshot!(batches_to_sort_string(&results), @r" + +----+-----+-----+ + | id | arr | len | + +----+-----+-----+ + | 1 | [] | 1 | + | 2 | [] | 1 | + | 3 | [] | 1 | + +----+-----+-----+ + "); + + Ok(()) +} + #[tokio::test] async fn test_parameter_type_coercion() -> Result<()> { let ctx = SessionContext::new(); diff --git a/datafusion/functions-nested/src/make_array.rs b/datafusion/functions-nested/src/make_array.rs index 32af5df2c601..a4053be8f60c 100644 --- a/datafusion/functions-nested/src/make_array.rs +++ b/datafusion/functions-nested/src/make_array.rs @@ -26,8 +26,7 @@ use arrow::array::{ NullArray, OffsetSizeTrait, new_null_array, }; use arrow::buffer::OffsetBuffer; -use arrow::datatypes::DataType; -use arrow::datatypes::{DataType::Null, Field}; +use arrow::datatypes::{DataType, Field}; use datafusion_common::utils::SingleRowListArrayBuilder; use datafusion_common::{Result, plan_err}; use datafusion_expr::binary::{ @@ -96,7 +95,7 @@ impl ScalarUDFImpl for MakeArray { fn return_type(&self, arg_types: &[DataType]) -> Result { let element_type = if arg_types.is_empty() { - Null + DataType::Null } else { // At this point, all the type in array should be coerced to the same one. arg_types[0].to_owned() @@ -130,20 +129,23 @@ impl ScalarUDFImpl for MakeArray { /// Constructs an array using the input `data` as `ArrayRef`. /// Returns a reference-counted `Array` instance result. pub(crate) fn make_array_inner(arrays: &[ArrayRef]) -> Result { - let data_type = arrays.iter().find_map(|arg| { - let arg_type = arg.data_type(); - (!arg_type.is_null()).then_some(arg_type) - }); - - let data_type = data_type.unwrap_or(&Null); - if data_type.is_null() { - // Either an empty array or all nulls: - let length = arrays.iter().map(|a| a.len()).sum(); - let array = new_null_array(&Null, length); + // Zero arguments are the only case that should build a scalar empty list. + if arrays.is_empty() { + let array = new_null_array(&DataType::Null, 0); Ok(Arc::new( SingleRowListArrayBuilder::new(array).build_list_array(), )) } else { + // All-null inputs still need to flow through `array_array()` so rows + // are built per input row instead of collapsing to one value. + let data_type = arrays + .iter() + .find_map(|arg| { + let arg_type = arg.data_type(); + (!arg_type.is_null()).then_some(arg_type) + }) + .unwrap_or(&DataType::Null); + array_array::(arrays, data_type.clone(), Field::LIST_FIELD_DEFAULT_NAME) } } @@ -256,3 +258,31 @@ pub fn coerce_types_inner(arg_types: &[DataType], name: &str) -> Result().unwrap(); + + assert_eq!(list.len(), 3); + assert_eq!(list.value_type(), DataType::Null); + assert_eq!(list.values().len(), 6); + + for row in 0..list.len() { + assert_eq!(list.value_length(row), 2); + let values = list.value(row); + assert_eq!(values.len(), 2); + assert_eq!(values.logical_null_count(), 2); + } + } +} diff --git a/datafusion/spark/src/function/array/spark_array.rs b/datafusion/spark/src/function/array/spark_array.rs index d6d4e7f0ab9f..d4274a145d84 100644 --- a/datafusion/spark/src/function/array/spark_array.rs +++ b/datafusion/spark/src/function/array/spark_array.rs @@ -110,28 +110,53 @@ impl ScalarUDFImpl for SparkArray { /// Constructs an array using the input `data` as `ArrayRef`. /// Returns a reference-counted `Array` instance result. pub fn make_array_inner(arrays: &[ArrayRef]) -> Result { - let mut data_type = DataType::Null; - for arg in arrays { - let arg_data_type = arg.data_type(); - if !arg_data_type.equals_datatype(&DataType::Null) { - data_type = arg_data_type.clone(); - break; - } + // Zero arguments are the only case that should build a scalar empty list. + if arrays.is_empty() { + let array = new_null_array(&DataType::Null, 0); + Ok(Arc::new( + SingleRowListArrayBuilder::new(array) + .with_field_name(Some(ARRAY_FIELD_DEFAULT_NAME.to_string())) + .build_list_array(), + )) + } else { + // All-null inputs still need to flow through `array_array()` so rows + // are built per input row instead of collapsing to one value. + let data_type = arrays + .iter() + .find_map(|arg| { + let arg_type = arg.data_type(); + (!arg_type.is_null()).then_some(arg_type) + }) + .unwrap_or(&DataType::Null); + array_array::(arrays, data_type.clone(), ARRAY_FIELD_DEFAULT_NAME) } +} - match data_type { - // Either an empty array or all nulls: - DataType::Null => { - let length = arrays.iter().map(|a| a.len()).sum(); - // By default Int32 - let array = new_null_array(&DataType::Null, length); - Ok(Arc::new( - SingleRowListArrayBuilder::new(array) - .with_nullable(true) - .with_field_name(Some(ARRAY_FIELD_DEFAULT_NAME.to_string())) - .build_list_array(), - )) +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ListArray, NullArray}; + use arrow::datatypes::DataType; + + #[test] + fn spark_array_inner_all_null_arrays_preserves_row_count_and_width() { + let inputs = vec![ + Arc::new(NullArray::new(3)) as ArrayRef, + Arc::new(NullArray::new(3)) as ArrayRef, + ]; + + let result = make_array_inner(&inputs).unwrap(); + let list = result.as_any().downcast_ref::().unwrap(); + + assert_eq!(list.len(), 3); + assert_eq!(list.value_type(), DataType::Null); + assert_eq!(list.values().len(), 6); + + for row in 0..list.len() { + assert_eq!(list.value_length(row), 2); + let values = list.value(row); + assert_eq!(values.len(), 2); + assert_eq!(values.logical_null_count(), 2); } - _ => array_array::(arrays, data_type, ARRAY_FIELD_DEFAULT_NAME), } } diff --git a/datafusion/sqllogictest/test_files/array/make_array.slt b/datafusion/sqllogictest/test_files/array/make_array.slt index fa91dcbaabc7..2596287fcabf 100644 --- a/datafusion/sqllogictest/test_files/array/make_array.slt +++ b/datafusion/sqllogictest/test_files/array/make_array.slt @@ -88,6 +88,16 @@ select make_array(NULL), make_array(NULL, NULL, NULL), make_array(make_array(NUL ---- [NULL] [NULL, NULL, NULL] [[NULL, NULL], [NULL, NULL]] +# make_array with null-array parameter preserves input rows and list-null value +query I?I +select id, make_array(n), array_length(make_array(n)) +from (values (1, NULL), (2, NULL), (3, NULL)) as t(id, n) +order by id; +---- +1 [NULL] 1 +2 [NULL] 1 +3 [NULL] 1 + # make_array with 1 columns query ??? select make_array(a), make_array(d), make_array(e) from values; diff --git a/datafusion/sqllogictest/test_files/spark/array/array.slt b/datafusion/sqllogictest/test_files/spark/array/array.slt index 79dca1c10a7d..03246e53b32a 100644 --- a/datafusion/sqllogictest/test_files/spark/array/array.slt +++ b/datafusion/sqllogictest/test_files/spark/array/array.slt @@ -43,6 +43,16 @@ SELECT array(null); ---- [NULL] +# array with null-array parameter preserves input rows and list-null value +query I?I +SELECT id, array(n), size(array(n)) +FROM (VALUES (1, NULL), (2, NULL), (3, NULL)) AS t(id, n) +ORDER BY id; +---- +1 [NULL] 1 +2 [NULL] 1 +3 [NULL] 1 + query ? SELECT array(1, NULL, 3);