From 8a4821bc6f911984a36f8d62bed7b0a3a6f2796d Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 16 Apr 2026 18:30:13 -0400 Subject: [PATCH 1/6] refactor: Rename concat-specific string builders Rename the three builders in `datafusion/functions/src/strings.rs` to make their special-purpose nature explicit: - `StringArrayBuilder` -> `ConcatStringBuilder` - `LargeStringArrayBuilder` -> `ConcatLargeStringBuilder` - `StringViewArrayBuilder` -> `ConcatStringViewBuilder` These builders are used only by `concat` and `concat_ws`; their `write::(col: &ColumnarValueRef, i)` + `append_offset()` API is shaped around accumulating multiple input fragments into one output row and is not a general-purpose string-builder replacement. The `Concat` prefix makes that clear and frees the previous names for a follow-up change that introduces general-purpose bulk-nulls builders with a simpler `append_value(&str)` API. --- datafusion/functions/src/string/concat.rs | 9 ++-- datafusion/functions/src/string/concat_ws.rs | 9 ++-- datafusion/functions/src/strings.rs | 48 ++++++++++++-------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index e22c3b80dacb4..b10db23472c99 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -22,7 +22,8 @@ use std::sync::Arc; use crate::string::concat; use crate::strings::{ - ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder, + ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder, + ConcatStringViewBuilder, }; use datafusion_common::cast::{as_binary_array, as_string_array, as_string_view_array}; use datafusion_common::{ @@ -242,7 +243,7 @@ impl ScalarUDFImpl for ConcatFunc { match return_datatype { DataType::Utf8 => { - let mut builder = StringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() @@ -254,7 +255,7 @@ impl ScalarUDFImpl for ConcatFunc { Ok(ColumnarValue::Array(Arc::new(string_array))) } DataType::Utf8View => { - let mut builder = StringViewArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() @@ -266,7 +267,7 @@ impl ScalarUDFImpl for ConcatFunc { Ok(ColumnarValue::Array(Arc::new(string_array))) } DataType::LargeUtf8 => { - let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size); for i in 0..len { columns .iter() diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index fc4cc6e43b160..2c2d4bd42165b 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -24,7 +24,8 @@ use crate::string::concat; use crate::string::concat::simplify_concat; use crate::string::concat_ws; use crate::strings::{ - ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder, + ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder, + ConcatStringViewBuilder, }; use datafusion_common::cast::{ as_large_string_array, as_string_array, as_string_view_array, @@ -311,7 +312,7 @@ impl ScalarUDFImpl for ConcatWsFunc { match return_datatype { DataType::Utf8View => { - let mut builder = StringViewArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; @@ -332,7 +333,7 @@ impl ScalarUDFImpl for ConcatWsFunc { Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?))) } DataType::LargeUtf8 => { - let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; @@ -353,7 +354,7 @@ impl ScalarUDFImpl for ConcatWsFunc { Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?))) } _ => { - let mut builder = StringArrayBuilder::with_capacity(len, data_size); + let mut builder = ConcatStringBuilder::with_capacity(len, data_size); for i in 0..len { if !sep.is_valid(i) { builder.append_offset()?; diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index bcfb3d666c8ba..b200d1a11c438 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -26,17 +26,22 @@ use arrow::array::{ use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer}; use arrow::datatypes::DataType; -/// Optimized version of the StringBuilder in Arrow that: -/// 1. Precalculating the expected length of the result, avoiding reallocations. -/// 2. Avoids creating / incrementally creating a `NullBufferBuilder` -pub struct StringArrayBuilder { +/// Builder used by `concat`/`concat_ws` to assemble a [`StringArray`] one row +/// at a time from multiple input columns. +/// +/// Each row is written via repeated [`Self::write`] calls (one per input +/// fragment) followed by a single [`Self::append_offset`] to commit the row. +/// The output null buffer is computed in bulk by the caller and supplied to +/// [`Self::finish`], avoiding per-row [`arrow::array::builder::NullBufferBuilder`] +/// work. +pub struct ConcatStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call tainted: bool, } -impl StringArrayBuilder { +impl ConcatStringBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let capacity = item_capacity .checked_add(1) @@ -151,13 +156,17 @@ impl StringArrayBuilder { } } -/// Optimized version of Arrow's [`StringViewBuilder`]. Rather than adding NULLs -/// on a row-by-row basis, the caller should provide nulls when calling -/// [`finish`](Self::finish). This allows callers to compute nulls more -/// efficiently (e.g., via bulk bitmap operations). +/// Builder used by `concat`/`concat_ws` to assemble a [`StringViewArray`] one +/// row at a time from multiple input columns. +/// +/// Each row is written via repeated [`Self::write`] calls (one per input +/// fragment) followed by a single [`Self::append_offset`] to commit the row +/// as a single string view. The output null buffer is supplied by the caller +/// at [`Self::finish`] time, avoiding per-row +/// [`arrow::array::builder::NullBufferBuilder`] work. /// -/// [`StringViewBuilder`]: arrow::array::StringViewBuilder -pub struct StringViewArrayBuilder { +/// [`StringViewArray`]: arrow::array::StringViewArray +pub struct ConcatStringViewBuilder { views: Vec, data: Vec, block: Vec, @@ -165,7 +174,7 @@ pub struct StringViewArrayBuilder { tainted: bool, } -impl StringViewArrayBuilder { +impl ConcatStringViewBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { Self { views: Vec::with_capacity(item_capacity), @@ -286,14 +295,17 @@ impl StringViewArrayBuilder { } } -pub struct LargeStringArrayBuilder { +/// Builder used by `concat`/`concat_ws` to assemble a [`LargeStringArray`] one +/// row at a time from multiple input columns. See [`ConcatStringBuilder`] for +/// details on the row-composition contract. +pub struct ConcatLargeStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call tainted: bool, } -impl LargeStringArrayBuilder { +impl ConcatLargeStringBuilder { pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { let capacity = item_capacity .checked_add(1) @@ -497,13 +509,13 @@ mod tests { #[test] #[should_panic(expected = "capacity integer overflow")] - fn test_overflow_string_array_builder() { - let _builder = StringArrayBuilder::with_capacity(usize::MAX, usize::MAX); + fn test_overflow_concat_string_builder() { + let _builder = ConcatStringBuilder::with_capacity(usize::MAX, usize::MAX); } #[test] #[should_panic(expected = "capacity integer overflow")] - fn test_overflow_large_string_array_builder() { - let _builder = LargeStringArrayBuilder::with_capacity(usize::MAX, usize::MAX); + fn test_overflow_concat_large_string_builder() { + let _builder = ConcatLargeStringBuilder::with_capacity(usize::MAX, usize::MAX); } } From 75b2bfb948fc500aafae861b74bef172ee7609a6 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 16 Apr 2026 18:31:09 -0400 Subject: [PATCH 2/6] perf: Add bulk-nulls string array builders, use in `lower`/`upper` Introduce three new string array builders with bulk null tracking: - `StringArrayBuilder` (Utf8) - `LargeStringArrayBuilder` (LargeUtf8) - `StringViewArrayBuilder` (Utf8View) Each builder has the following API: - append_value(&str) -- add a non-NULL value (row) - append_placeholder() -- add a NULL row placeholder - finish(Option) -- finish the build, specify NULLs These are the counterpart of Arrow's `GenericStringBuilder` / `StringViewBuilder` but it skips per-row NULL buffer maintenance, which lets callers compute the NULL buffer in bulk when possible. This PR also switches `case_conversion` to use the new APIs, which is used to implement `lower`, `upper`, and the Spark equivalents. This improves `lower` / `upper` performance by 3-15% on microbenchmarks. More UDFs (~10) will be converted to use this API in future PRs. --- datafusion/functions/src/string/common.rs | 63 ++-- datafusion/functions/src/strings.rs | 365 +++++++++++++++++++++- 2 files changed, 400 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index bda6b429ac772..4ec02db0ba849 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -19,10 +19,10 @@ use std::sync::Arc; -use crate::strings::append_view; +use crate::strings::{GenericStringArrayBuilder, StringViewArrayBuilder, append_view}; use arrow::array::{ - Array, ArrayRef, GenericStringArray, GenericStringBuilder, NullBufferBuilder, - OffsetSizeTrait, StringViewArray, StringViewBuilder, new_null_array, + Array, ArrayRef, GenericStringArray, NullBufferBuilder, OffsetSizeTrait, + StringViewArray, new_null_array, }; use arrow::buffer::{Buffer, ScalarBuffer}; use arrow::datatypes::DataType; @@ -349,18 +349,29 @@ where >(array, op)?)), DataType::Utf8View => { let string_array = as_string_view_array(array)?; - let mut string_builder = - StringViewBuilder::with_capacity(string_array.len()); - - for str in string_array.iter() { - if let Some(str) = str { - string_builder.append_value(op(str)); - } else { - string_builder.append_null(); + let item_len = string_array.len(); + let nulls = string_array.nulls().cloned(); + let mut builder = StringViewArrayBuilder::with_capacity(item_len); + + if let Some(ref n) = nulls { + for i in 0..item_len { + if n.is_null(i) { + builder.append_placeholder(); + } else { + // SAFETY: `n.is_null(i)` was false in the branch above. + let s = unsafe { string_array.value_unchecked(i) }; + builder.append_value(&op(s)); + } + } + } else { + for i in 0..item_len { + // SAFETY: no null buffer means every index is valid. + let s = unsafe { string_array.value_unchecked(i) }; + builder.append_value(&op(s)); } } - Ok(ColumnarValue::Array(Arc::new(string_builder.finish()))) + Ok(ColumnarValue::Array(Arc::new(builder.finish(nulls)?))) } other => exec_err!("Unsupported data type {other:?} for function {name}"), }, @@ -399,18 +410,28 @@ where // Values contain non-ASCII. let item_len = string_array.len(); - let capacity = string_array.value_data().len() + PRE_ALLOC_BYTES; - let mut builder = GenericStringBuilder::::with_capacity(item_len, capacity); + let capacity = value_data.len() + PRE_ALLOC_BYTES; + let nulls = string_array.nulls().cloned(); + let mut builder = GenericStringArrayBuilder::::with_capacity(item_len, capacity); - if string_array.null_count() == 0 { - let iter = - (0..item_len).map(|i| Some(op(unsafe { string_array.value_unchecked(i) }))); - builder.extend(iter); + if let Some(ref n) = nulls { + for i in 0..item_len { + if n.is_null(i) { + builder.append_placeholder(); + } else { + // SAFETY: `n.is_null(i)` was false in the branch above. + let s = unsafe { string_array.value_unchecked(i) }; + builder.append_value(&op(s)); + } + } } else { - let iter = string_array.iter().map(|string| string.map(&op)); - builder.extend(iter); + for i in 0..item_len { + // SAFETY: no null buffer means every index is valid. + let s = unsafe { string_array.value_unchecked(i) }; + builder.append_value(&op(s)); + } } - Ok(Arc::new(builder.finish())) + Ok(Arc::new(builder.finish(nulls)?)) } /// All values of string_array are ASCII, and when converting case, there is no changes in the byte diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index b200d1a11c438..2dd3adcb939bb 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. +use std::marker::PhantomData; use std::mem::size_of; use datafusion_common::{Result, exec_datafusion_err, internal_err}; use arrow::array::{ - Array, ArrayAccessor, ArrayDataBuilder, BinaryArray, ByteView, LargeStringArray, - StringArray, StringViewArray, make_view, + Array, ArrayAccessor, ArrayDataBuilder, BinaryArray, ByteView, GenericStringArray, + LargeStringArray, OffsetSizeTrait, StringArray, StringViewArray, make_view, }; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer}; use arrow::datatypes::DataType; @@ -34,7 +35,10 @@ use arrow::datatypes::DataType; /// The output null buffer is computed in bulk by the caller and supplied to /// [`Self::finish`], avoiding per-row [`arrow::array::builder::NullBufferBuilder`] /// work. -pub struct ConcatStringBuilder { +/// +/// For the common "produce one `&str` per row" pattern, prefer +/// [`GenericStringArrayBuilder`][GenericStringArrayBuilder] instead. +pub(crate) struct ConcatStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call @@ -165,8 +169,11 @@ impl ConcatStringBuilder { /// at [`Self::finish`] time, avoiding per-row /// [`arrow::array::builder::NullBufferBuilder`] work. /// +/// For the common "produce one `&str` per row" pattern, prefer +/// [`StringViewArrayBuilder`] instead. +/// /// [`StringViewArray`]: arrow::array::StringViewArray -pub struct ConcatStringViewBuilder { +pub(crate) struct ConcatStringViewBuilder { views: Vec, data: Vec, block: Vec, @@ -298,7 +305,10 @@ impl ConcatStringViewBuilder { /// Builder used by `concat`/`concat_ws` to assemble a [`LargeStringArray`] one /// row at a time from multiple input columns. See [`ConcatStringBuilder`] for /// details on the row-composition contract. -pub struct ConcatLargeStringBuilder { +/// +/// For the common "produce one `&str` per row" pattern, prefer +/// [`GenericStringArrayBuilder`][GenericStringArrayBuilder] instead. +pub(crate) struct ConcatLargeStringBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, /// If true, a safety check is required during the `finish` call @@ -420,6 +430,222 @@ impl ConcatLargeStringBuilder { } } +// ---------------------------------------------------------------------------- +// Bulk-nulls builders +// +// These builders are similar to Arrow's `GenericStringBuilder` and +// `StringViewBuilder`, except that callers must pass the NULL bitmap to +// `finish()`, rather than maintaining it iteratively (per-row). For callers +// that can compute the NULL bitmap in bulk (which is true of many +// string-related UDFs), this can be significantly more efficient. +// +// For a row known to be null, call `append_placeholder` to advance the row +// count without touching the value buffer; the placeholder slot will be +// masked by the caller-supplied null buffer. +// ---------------------------------------------------------------------------- + +/// Builder for a [`GenericStringArray`] that defers null tracking to +/// `finish`. Instantiate with `O = i32` for [`StringArray`] (Utf8) or +/// `O = i64` for [`LargeStringArray`] (LargeUtf8). +pub(crate) struct GenericStringArrayBuilder { + offsets_buffer: MutableBuffer, + value_buffer: MutableBuffer, + _phantom: PhantomData, +} + +impl GenericStringArrayBuilder { + pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { + let capacity = item_capacity + .checked_add(1) + .map(|i| i.saturating_mul(size_of::())) + .expect("capacity integer overflow"); + + let mut offsets_buffer = MutableBuffer::with_capacity(capacity); + offsets_buffer.push(O::usize_as(0)); + Self { + offsets_buffer, + value_buffer: MutableBuffer::with_capacity(data_capacity), + _phantom: PhantomData, + } + } + + /// Append `value` as the next row. + /// + /// # Panics + /// + /// Panics if the cumulative byte length exceeds `O::MAX`. + pub fn append_value(&mut self, value: &str) { + self.value_buffer.extend_from_slice(value.as_bytes()); + let next_offset = + O::from_usize(self.value_buffer.len()).expect("byte array offset overflow"); + self.offsets_buffer.push(next_offset); + } + + /// Append an empty placeholder row. The corresponding slot is meaningful + /// only if the caller masks it via the null buffer passed to `finish`. + pub fn append_placeholder(&mut self) { + let next_offset = + O::from_usize(self.value_buffer.len()).expect("byte array offset overflow"); + self.offsets_buffer.push(next_offset); + } + + /// Finalize into a [`GenericStringArray`] using the caller-supplied + /// null buffer. + /// + /// # Errors + /// + /// Returns an error when `null_buffer.len()` does not match the number of + /// appended rows. + pub fn finish( + self, + null_buffer: Option, + ) -> Result> { + let row_count = self.offsets_buffer.len() / size_of::() - 1; + if let Some(ref n) = null_buffer + && n.len() != row_count + { + return internal_err!( + "Null buffer length ({}) must match row count ({row_count})", + n.len() + ); + } + let array_data = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) + .len(row_count) + .add_buffer(self.offsets_buffer.into()) + .add_buffer(self.value_buffer.into()) + .nulls(null_buffer); + // SAFETY: every appended value came from a `&str`, so the value + // buffer is valid UTF-8 and offsets are monotonically non-decreasing. + let array_data = unsafe { array_data.build_unchecked() }; + Ok(GenericStringArray::::from(array_data)) + } +} + +/// Starting size for the long-string data block; matches Arrow's +/// `GenericByteViewBuilder` default. +const STARTING_BLOCK_SIZE: u32 = 8 * 1024; +/// Maximum size each long-string data block grows to; matches Arrow's +/// `GenericByteViewBuilder` default. +const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; + +/// Builder for a [`StringViewArray`] that defers null tracking to `finish`. +/// +/// Modeled on Arrow's [`arrow::array::builder::StringViewBuilder`] but +/// without per-row [`arrow::array::builder::NullBufferBuilder`] maintenance. +/// Long strings (> 12 bytes) are appended into an in-progress data block; +/// short strings are inlined into the view itself. When the in-progress block +/// fills up it is flushed into `completed` and a new block — double the size +/// of the last, capped at [`MAX_BLOCK_SIZE`] — is started. +pub(crate) struct StringViewArrayBuilder { + views: Vec, + in_progress: Vec, + completed: Vec, + /// Current block-size target; doubles each time a block is flushed, up to + /// [`MAX_BLOCK_SIZE`]. + block_size: u32, +} + +impl StringViewArrayBuilder { + pub fn with_capacity(item_capacity: usize) -> Self { + Self { + views: Vec::with_capacity(item_capacity), + in_progress: Vec::new(), + completed: Vec::new(), + block_size: STARTING_BLOCK_SIZE, + } + } + + /// Doubles the block-size target (capped at [`MAX_BLOCK_SIZE`]) and + /// returns the new size. The first call returns `2 * STARTING_BLOCK_SIZE`. + fn next_block_size(&mut self) -> u32 { + if self.block_size < MAX_BLOCK_SIZE { + self.block_size = self.block_size.saturating_mul(2); + } + self.block_size + } + + /// Append `value` as the next row. + /// + /// # Panics + /// + /// Panics if the value length, the in-progress buffer offset, or the + /// number of completed buffers exceeds `i32::MAX`. The ByteView spec + /// uses signed 32-bit integers for these fields; exceeding `i32::MAX` + /// would produce an array that does not round-trip through Arrow IPC + /// (see ). + #[inline] + pub fn append_value(&mut self, value: &str) { + let v = value.as_bytes(); + let length: u32 = i32::try_from(v.len()) + .expect("value length exceeds i32::MAX") as u32; + if length <= 12 { + self.views.push(make_view(v, 0, 0)); + return; + } + + let required_cap = self.in_progress.len() + length as usize; + if self.in_progress.capacity() < required_cap { + self.flush_in_progress(); + let to_reserve = (length as usize).max(self.next_block_size() as usize); + self.in_progress.reserve(to_reserve); + } + + let buffer_index: u32 = i32::try_from(self.completed.len()) + .expect("buffer count exceeds i32::MAX") as u32; + let offset: u32 = i32::try_from(self.in_progress.len()) + .expect("offset exceeds i32::MAX") as u32; + self.in_progress.extend_from_slice(v); + self.views.push(make_view(v, buffer_index, offset)); + } + + /// Append an empty placeholder row. The corresponding slot is meaningful + /// only if the caller masks it via the null buffer passed to `finish`. + #[inline] + pub fn append_placeholder(&mut self) { + // Zero-length inline view — `length` field is 0, no buffer ref. + self.views.push(0); + } + + fn flush_in_progress(&mut self) { + if !self.in_progress.is_empty() { + let block = std::mem::take(&mut self.in_progress); + self.completed.push(Buffer::from_vec(block)); + } + } + + /// Finalize into a [`StringViewArray`] using the caller-supplied null + /// buffer. + /// + /// # Errors + /// + /// Returns an error when `null_buffer.len()` does not match the number of + /// appended rows. + pub fn finish(mut self, null_buffer: Option) -> Result { + if let Some(ref n) = null_buffer + && n.len() != self.views.len() + { + return internal_err!( + "Null buffer length ({}) must match row count ({})", + n.len(), + self.views.len() + ); + } + self.flush_in_progress(); + // SAFETY: every long-string view references bytes we wrote ourselves + // into `self.completed`, with prefixes derived from those same bytes. + // Inline views were built from valid `&str`. Placeholder views are + // zero-length with no buffer reference. + let array = unsafe { + StringViewArray::new_unchecked( + ScalarBuffer::from(self.views), + self.completed, + null_buffer, + ) + }; + Ok(array) + } +} + /// Append a new view to the views buffer with the given substr. /// /// Callers are responsible for their own null tracking. @@ -438,7 +664,7 @@ impl ConcatLargeStringBuilder { /// LLVM is apparently overly eager to inline this function into some hot loops, /// which bloats them and regresses performance, so we disable inlining for now. #[inline(never)] -pub fn append_view( +pub(crate) fn append_view( views_buffer: &mut Vec, original_view: &u128, substr: &str, @@ -459,7 +685,7 @@ pub fn append_view( } #[derive(Debug)] -pub enum ColumnarValueRef<'a> { +pub(crate) enum ColumnarValueRef<'a> { Scalar(&'a [u8]), NullableArray(&'a StringArray), NonNullableArray(&'a StringArray), @@ -518,4 +744,129 @@ mod tests { fn test_overflow_concat_large_string_builder() { let _builder = ConcatLargeStringBuilder::with_capacity(usize::MAX, usize::MAX); } + + #[test] + fn string_array_builder_empty() { + let builder = GenericStringArrayBuilder::::with_capacity(0, 0); + let array = builder.finish(None).unwrap(); + assert_eq!(array.len(), 0); + } + + #[test] + fn string_array_builder_no_nulls() { + let mut builder = GenericStringArrayBuilder::::with_capacity(3, 16); + builder.append_value("foo"); + builder.append_value(""); + builder.append_value("hello world"); + let array = builder.finish(None).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.value(0), "foo"); + assert_eq!(array.value(1), ""); + assert_eq!(array.value(2), "hello world"); + assert_eq!(array.null_count(), 0); + } + + #[test] + fn string_array_builder_with_nulls() { + let mut builder = GenericStringArrayBuilder::::with_capacity(3, 8); + builder.append_value("a"); + builder.append_placeholder(); + builder.append_value("c"); + let nulls = NullBuffer::from(vec![true, false, true]); + let array = builder.finish(Some(nulls)).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.value(0), "a"); + assert!(array.is_null(1)); + assert_eq!(array.value(2), "c"); + } + + #[test] + fn string_array_builder_null_buffer_length_mismatch() { + let mut builder = GenericStringArrayBuilder::::with_capacity(2, 4); + builder.append_value("a"); + builder.append_value("b"); + let nulls = NullBuffer::from(vec![true, false, true]); + assert!(builder.finish(Some(nulls)).is_err()); + } + + #[test] + fn large_string_array_builder_with_nulls() { + let mut builder = GenericStringArrayBuilder::::with_capacity(3, 8); + builder.append_value("a"); + builder.append_placeholder(); + builder.append_value("c"); + let nulls = NullBuffer::from(vec![true, false, true]); + let array = builder.finish(Some(nulls)).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.value(0), "a"); + assert!(array.is_null(1)); + assert_eq!(array.value(2), "c"); + } + + #[test] + fn string_view_array_builder_empty() { + let builder = StringViewArrayBuilder::with_capacity(0); + let array = builder.finish(None).unwrap(); + assert_eq!(array.len(), 0); + } + + #[test] + fn string_view_array_builder_inline_and_buffer() { + let mut builder = StringViewArrayBuilder::with_capacity(3); + builder.append_value("short"); // ≤ 12 bytes, inline + builder.append_value("a string longer than twelve bytes"); + builder.append_value(""); + let array = builder.finish(None).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.value(0), "short"); + assert_eq!(array.value(1), "a string longer than twelve bytes"); + assert_eq!(array.value(2), ""); + } + + #[test] + fn string_view_array_builder_with_nulls() { + let mut builder = StringViewArrayBuilder::with_capacity(4); + builder.append_value("a string longer than twelve bytes"); + builder.append_placeholder(); + builder.append_value("short"); + builder.append_placeholder(); + let nulls = NullBuffer::from(vec![true, false, true, false]); + let array = builder.finish(Some(nulls)).unwrap(); + assert_eq!(array.len(), 4); + assert_eq!(array.value(0), "a string longer than twelve bytes"); + assert!(array.is_null(1)); + assert_eq!(array.value(2), "short"); + assert!(array.is_null(3)); + } + + #[test] + fn string_view_array_builder_null_buffer_length_mismatch() { + let mut builder = StringViewArrayBuilder::with_capacity(2); + builder.append_value("a"); + builder.append_value("b"); + let nulls = NullBuffer::from(vec![true, false, true]); + assert!(builder.finish(Some(nulls)).is_err()); + } + + #[test] + fn string_view_array_builder_flushes_full_blocks() { + // Each value is 300 bytes. The first data block is 2 × STARTING_BLOCK_SIZE + // = 16 KiB, so ~50 values saturate it and the rest spill into additional + // blocks. + let value = "x".repeat(300); + let mut builder = StringViewArrayBuilder::with_capacity(100); + for _ in 0..100 { + builder.append_value(&value); + } + let array = builder.finish(None).unwrap(); + assert_eq!(array.len(), 100); + assert!( + array.data_buffers().len() > 1, + "expected multiple data buffers, got {}", + array.data_buffers().len() + ); + for i in 0..100 { + assert_eq!(array.value(i), value); + } + } } From 2c9bdfc9ce407f4353fbb797560f627c368aab9a Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 22 Apr 2026 12:43:30 -0400 Subject: [PATCH 3/6] cargo fmt --- datafusion/functions/src/strings.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index 2dd3adcb939bb..8336db8b53978 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -576,8 +576,8 @@ impl StringViewArrayBuilder { #[inline] pub fn append_value(&mut self, value: &str) { let v = value.as_bytes(); - let length: u32 = i32::try_from(v.len()) - .expect("value length exceeds i32::MAX") as u32; + let length: u32 = + i32::try_from(v.len()).expect("value length exceeds i32::MAX") as u32; if length <= 12 { self.views.push(make_view(v, 0, 0)); return; @@ -591,7 +591,8 @@ impl StringViewArrayBuilder { } let buffer_index: u32 = i32::try_from(self.completed.len()) - .expect("buffer count exceeds i32::MAX") as u32; + .expect("buffer count exceeds i32::MAX") + as u32; let offset: u32 = i32::try_from(self.in_progress.len()) .expect("offset exceeds i32::MAX") as u32; self.in_progress.extend_from_slice(v); From 445f119e81769a8ab6457c5b4ae2319b9022528e Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 23 Apr 2026 10:17:13 -0400 Subject: [PATCH 4/6] Improve comments, per review --- datafusion/functions/src/string/common.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 4ec02db0ba849..133fe7f023056 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -350,6 +350,7 @@ where DataType::Utf8View => { let string_array = as_string_view_array(array)?; let item_len = string_array.len(); + // Null-preserving: reuse the input null buffer as the output null buffer. let nulls = string_array.nulls().cloned(); let mut builder = StringViewArrayBuilder::with_capacity(item_len); @@ -411,6 +412,7 @@ where // Values contain non-ASCII. let item_len = string_array.len(); let capacity = value_data.len() + PRE_ALLOC_BYTES; + // Null-preserving: reuse the input null buffer as the output null buffer. let nulls = string_array.nulls().cloned(); let mut builder = GenericStringArrayBuilder::::with_capacity(item_len, capacity); @@ -459,7 +461,8 @@ where let values = Buffer::from_vec(bytes); let offsets = string_array.offsets().clone(); let nulls = string_array.nulls().cloned(); - // SAFETY: offsets and nulls are consistent with the input array. + + // SAFETY: we can reuse the offsets and nulls from the input array Ok(Arc::new(unsafe { GenericStringArray::::new_unchecked(offsets, values, nulls) })) From 128443cf3d0d739811acc2cc066a905e43566fc9 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 23 Apr 2026 11:31:24 -0400 Subject: [PATCH 5/6] Check that placeholder count is consistent with null buffer --- datafusion/functions/src/strings.rs | 67 +++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index 8336db8b53978..0024f0fe789d0 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -450,6 +450,7 @@ impl ConcatLargeStringBuilder { pub(crate) struct GenericStringArrayBuilder { offsets_buffer: MutableBuffer, value_buffer: MutableBuffer, + placeholder_count: usize, _phantom: PhantomData, } @@ -465,6 +466,7 @@ impl GenericStringArrayBuilder { Self { offsets_buffer, value_buffer: MutableBuffer::with_capacity(data_capacity), + placeholder_count: 0, _phantom: PhantomData, } } @@ -481,12 +483,13 @@ impl GenericStringArrayBuilder { self.offsets_buffer.push(next_offset); } - /// Append an empty placeholder row. The corresponding slot is meaningful - /// only if the caller masks it via the null buffer passed to `finish`. + /// Append an empty placeholder row. The corresponding slot must be masked + /// as null by the null buffer passed to `finish`. pub fn append_placeholder(&mut self) { let next_offset = O::from_usize(self.value_buffer.len()).expect("byte array offset overflow"); self.offsets_buffer.push(next_offset); + self.placeholder_count += 1; } /// Finalize into a [`GenericStringArray`] using the caller-supplied @@ -509,6 +512,12 @@ impl GenericStringArrayBuilder { n.len() ); } + let null_count = null_buffer.as_ref().map_or(0, |n| n.null_count()); + debug_assert!( + null_count >= self.placeholder_count, + "{} placeholder rows appended but null buffer has {null_count} null(s)", + self.placeholder_count, + ); let array_data = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) .len(row_count) .add_buffer(self.offsets_buffer.into()) @@ -543,6 +552,7 @@ pub(crate) struct StringViewArrayBuilder { /// Current block-size target; doubles each time a block is flushed, up to /// [`MAX_BLOCK_SIZE`]. block_size: u32, + placeholder_count: usize, } impl StringViewArrayBuilder { @@ -552,6 +562,7 @@ impl StringViewArrayBuilder { in_progress: Vec::new(), completed: Vec::new(), block_size: STARTING_BLOCK_SIZE, + placeholder_count: 0, } } @@ -599,12 +610,13 @@ impl StringViewArrayBuilder { self.views.push(make_view(v, buffer_index, offset)); } - /// Append an empty placeholder row. The corresponding slot is meaningful - /// only if the caller masks it via the null buffer passed to `finish`. + /// Append an empty placeholder row. The corresponding slot must be + /// masked as null by the null buffer passed to `finish`. #[inline] pub fn append_placeholder(&mut self) { // Zero-length inline view — `length` field is 0, no buffer ref. self.views.push(0); + self.placeholder_count += 1; } fn flush_in_progress(&mut self) { @@ -631,6 +643,12 @@ impl StringViewArrayBuilder { self.views.len() ); } + let null_count = null_buffer.as_ref().map_or(0, |n| n.null_count()); + debug_assert!( + null_count >= self.placeholder_count, + "{} placeholder rows appended but null buffer has {null_count} null(s)", + self.placeholder_count, + ); self.flush_in_progress(); // SAFETY: every long-string view references bytes we wrote ourselves // into `self.completed`, with prefixes derived from those same bytes. @@ -790,6 +808,27 @@ mod tests { assert!(builder.finish(Some(nulls)).is_err()); } + #[test] + #[cfg(debug_assertions)] + #[should_panic(expected = "placeholder rows appended")] + fn string_array_builder_placeholder_without_null_mask() { + let mut builder = GenericStringArrayBuilder::::with_capacity(2, 4); + builder.append_value("a"); + builder.append_placeholder(); + // Slot 1 is a placeholder but the null buffer doesn't mark it null. + let nulls = NullBuffer::from(vec![true, true]); + let _ = builder.finish(Some(nulls)); + } + + #[test] + #[cfg(debug_assertions)] + #[should_panic(expected = "placeholder rows appended")] + fn string_array_builder_placeholder_with_none_null_buffer() { + let mut builder = GenericStringArrayBuilder::::with_capacity(1, 4); + builder.append_placeholder(); + let _ = builder.finish(None); + } + #[test] fn large_string_array_builder_with_nulls() { let mut builder = GenericStringArrayBuilder::::with_capacity(3, 8); @@ -849,6 +888,26 @@ mod tests { assert!(builder.finish(Some(nulls)).is_err()); } + #[test] + #[cfg(debug_assertions)] + #[should_panic(expected = "placeholder rows appended")] + fn string_view_array_builder_placeholder_without_null_mask() { + let mut builder = StringViewArrayBuilder::with_capacity(2); + builder.append_value("a"); + builder.append_placeholder(); + let nulls = NullBuffer::from(vec![true, true]); + let _ = builder.finish(Some(nulls)); + } + + #[test] + #[cfg(debug_assertions)] + #[should_panic(expected = "placeholder rows appended")] + fn string_view_array_builder_placeholder_with_none_null_buffer() { + let mut builder = StringViewArrayBuilder::with_capacity(1); + builder.append_placeholder(); + let _ = builder.finish(None); + } + #[test] fn string_view_array_builder_flushes_full_blocks() { // Each value is 300 bytes. The first data block is 2 × STARTING_BLOCK_SIZE From bff3695d9a1e2ec603d278cf58607a679bae4155 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 23 Apr 2026 11:35:36 -0400 Subject: [PATCH 6/6] Tweak error messages --- datafusion/functions/src/strings.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index 0024f0fe789d0..8564a3e720297 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -515,7 +515,7 @@ impl GenericStringArrayBuilder { let null_count = null_buffer.as_ref().map_or(0, |n| n.null_count()); debug_assert!( null_count >= self.placeholder_count, - "{} placeholder rows appended but null buffer has {null_count} null(s)", + "{} placeholder rows but null buffer has {null_count} nulls", self.placeholder_count, ); let array_data = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) @@ -646,7 +646,7 @@ impl StringViewArrayBuilder { let null_count = null_buffer.as_ref().map_or(0, |n| n.null_count()); debug_assert!( null_count >= self.placeholder_count, - "{} placeholder rows appended but null buffer has {null_count} null(s)", + "{} placeholder rows but null buffer has {null_count} nulls", self.placeholder_count, ); self.flush_in_progress(); @@ -810,7 +810,7 @@ mod tests { #[test] #[cfg(debug_assertions)] - #[should_panic(expected = "placeholder rows appended")] + #[should_panic(expected = "placeholder rows")] fn string_array_builder_placeholder_without_null_mask() { let mut builder = GenericStringArrayBuilder::::with_capacity(2, 4); builder.append_value("a"); @@ -822,7 +822,7 @@ mod tests { #[test] #[cfg(debug_assertions)] - #[should_panic(expected = "placeholder rows appended")] + #[should_panic(expected = "placeholder rows")] fn string_array_builder_placeholder_with_none_null_buffer() { let mut builder = GenericStringArrayBuilder::::with_capacity(1, 4); builder.append_placeholder(); @@ -890,7 +890,7 @@ mod tests { #[test] #[cfg(debug_assertions)] - #[should_panic(expected = "placeholder rows appended")] + #[should_panic(expected = "placeholder rows")] fn string_view_array_builder_placeholder_without_null_mask() { let mut builder = StringViewArrayBuilder::with_capacity(2); builder.append_value("a"); @@ -901,7 +901,7 @@ mod tests { #[test] #[cfg(debug_assertions)] - #[should_panic(expected = "placeholder rows appended")] + #[should_panic(expected = "placeholder rows")] fn string_view_array_builder_placeholder_with_none_null_buffer() { let mut builder = StringViewArrayBuilder::with_capacity(1); builder.append_placeholder();