From 62bc4b28e3c777d5f8eb8197ea54d6c41abe87a3 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sat, 25 Apr 2026 09:44:32 -0400 Subject: [PATCH 1/3] perf: Use bulk-NULL builder in substr_index --- .../functions/src/unicode/substrindex.rs | 132 ++++++++++-------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index c04656282d942..749c9f5210a2e 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -18,11 +18,13 @@ use std::sync::Arc; use arrow::array::{ - ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, - GenericStringBuilder, OffsetSizeTrait, PrimitiveArray, + Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, + PrimitiveArray, }; +use arrow::buffer::NullBuffer; use arrow::datatypes::{DataType, Int32Type, Int64Type}; +use crate::strings::GenericStringArrayBuilder; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{Result, exec_err, utils::take_function_args}; use datafusion_expr::TypeSignature::Exact; @@ -176,71 +178,79 @@ where T::Native: OffsetSizeTrait, { let num_rows = string_array.len(); - let mut builder = GenericStringBuilder::::with_capacity(num_rows, 0); - let string_iter = ArrayIter::new(string_array); - let delimiter_array_iter = ArrayIter::new(delimiter_array); - let count_array_iter = ArrayIter::new(count_array); - string_iter - .zip(delimiter_array_iter) - .zip(count_array_iter) - .for_each(|((string, delimiter), n)| match (string, delimiter, n) { - (Some(string), Some(delimiter), Some(n)) => { - // In MySQL, these cases will return an empty string. - if n == 0 || string.is_empty() || delimiter.is_empty() { - builder.append_value(""); - return; - } + let mut builder = GenericStringArrayBuilder::::with_capacity(num_rows, 0); + // Output is null IFF any input row is null. Combine the input null + // buffers in bulk rather than tracking nulls per row in the builder. + let nulls = NullBuffer::union( + NullBuffer::union(string_array.nulls(), delimiter_array.nulls()).as_ref(), + count_array.nulls(), + ); + + for i in 0..num_rows { + if nulls.as_ref().is_some_and(|n| n.is_null(i)) { + builder.append_placeholder(); + continue; + } + // SAFETY: `i < num_rows`, and the union of input nulls is non-null at i, + // so each input is also non-null at i. + let string = unsafe { string_array.value_unchecked(i) }; + let delimiter = unsafe { delimiter_array.value_unchecked(i) }; + let n = unsafe { count_array.value_unchecked(i) }; + + // In MySQL, these cases will return an empty string. + if n == 0 || string.is_empty() || delimiter.is_empty() { + builder.append_value(""); + continue; + } - let occurrences = usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX); - let result_idx = if delimiter.len() == 1 { - // Fast path: use byte-level search for single-character delimiters - let d_byte = delimiter.as_bytes()[0]; - let bytes = string.as_bytes(); + let occurrences = usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX); + let result_idx = if delimiter.len() == 1 { + // Fast path: use byte-level search for single-character delimiters + let d_byte = delimiter.as_bytes()[0]; + let bytes = string.as_bytes(); - if n > 0 { - bytes - .iter() - .enumerate() - .filter(|&(_, &b)| b == d_byte) - .nth(occurrences - 1) - .map(|(idx, _)| idx) - } else { - bytes - .iter() - .enumerate() - .rev() - .filter(|&(_, &b)| b == d_byte) - .nth(occurrences - 1) - .map(|(idx, _)| idx + 1) - } - } else if n > 0 { - // Multi-byte path: forward search for n-th occurrence - string - .match_indices(delimiter) - .nth(occurrences - 1) - .map(|(idx, _)| idx) + if n > 0 { + bytes + .iter() + .enumerate() + .filter(|&(_, &b)| b == d_byte) + .nth(occurrences - 1) + .map(|(idx, _)| idx) + } else { + bytes + .iter() + .enumerate() + .rev() + .filter(|&(_, &b)| b == d_byte) + .nth(occurrences - 1) + .map(|(idx, _)| idx + 1) + } + } else if n > 0 { + // Multi-byte path: forward search for n-th occurrence + string + .match_indices(delimiter) + .nth(occurrences - 1) + .map(|(idx, _)| idx) + } else { + // Multi-byte path: backward search for n-th occurrence from the right + string + .rmatch_indices(delimiter) + .nth(occurrences - 1) + .map(|(idx, _)| idx + delimiter.len()) + }; + match result_idx { + Some(idx) => { + if n > 0 { + builder.append_value(&string[..idx]); } else { - // Multi-byte path: backward search for n-th occurrence from the right - string - .rmatch_indices(delimiter) - .nth(occurrences - 1) - .map(|(idx, _)| idx + delimiter.len()) - }; - match result_idx { - Some(idx) => { - if n > 0 { - builder.append_value(&string[..idx]); - } else { - builder.append_value(&string[idx..]); - } - } - None => builder.append_value(string), + builder.append_value(&string[idx..]); } } - _ => builder.append_null(), - }); + None => builder.append_value(string), + } + } - Ok(Arc::new(builder.finish()) as ArrayRef) + Ok(Arc::new(builder.finish(nulls)?) as ArrayRef) } #[cfg(test)] From 9a39a082eaa7d85c8de259b2fcd2303ce16c8765 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 27 Apr 2026 10:25:25 -0400 Subject: [PATCH 2/3] Convert other call-site to use new builder --- .../functions/src/unicode/substrindex.rs | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index f0636c4fc9bc6..1d077777b5c48 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -18,9 +18,8 @@ use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, AsArray, ByteView, GenericStringArray, GenericStringBuilder, - OffsetSizeTrait, PrimitiveArray, StringArrayType, StringLikeArrayBuilder, - StringViewArray, make_view, new_null_array, + Array, ArrayRef, AsArray, ByteView, GenericStringArray, OffsetSizeTrait, + PrimitiveArray, StringArrayType, StringViewArray, make_view, new_null_array, }; use arrow::buffer::ScalarBuffer; use arrow::datatypes::{DataType, Int64Type}; @@ -230,7 +229,7 @@ fn substr_index_scalar( arr, delimiter, count, - GenericStringBuilder::::with_capacity( + GenericStringArrayBuilder::::with_capacity( arr.len(), visible_string_bytes(arr), ), @@ -242,7 +241,7 @@ fn substr_index_scalar( arr, delimiter, count, - GenericStringBuilder::::with_capacity( + GenericStringArrayBuilder::::with_capacity( arr.len(), visible_string_bytes(arr), ), @@ -273,8 +272,7 @@ where O: OffsetSizeTrait, { let num_rows = string_array.len(); - // Output is null IFF any input row is null. Combine the input null - // buffers in bulk rather than tracking nulls per row in the builder. + // Output is null IFF any input is null. let nulls = NullBuffer::union( NullBuffer::union(string_array.nulls(), delimiter_array.nulls()).as_ref(), count_array.nulls(), @@ -285,8 +283,8 @@ where builder.append_placeholder(); continue; } - // SAFETY: `i < num_rows`, and the union of input nulls is non-null at i, - // so each input is also non-null at i. + // SAFETY: `i < num_rows`, and the union of input nulls is valid at i, + // so each input is also valid at i. let string = unsafe { string_array.value_unchecked(i) }; let delimiter = unsafe { delimiter_array.value_unchecked(i) }; let n = unsafe { count_array.value_unchecked(i) }; @@ -341,15 +339,15 @@ fn substr_index_view( } } -fn substr_index_scalar_impl<'a, S, B>( +fn substr_index_scalar_impl<'a, S, O>( string_array: S, delimiter: &str, count: i64, - builder: B, + builder: GenericStringArrayBuilder, ) -> Result where S: StringArrayType<'a> + Copy, - B: StringLikeArrayBuilder, + O: OffsetSizeTrait, { if count == 0 || delimiter.is_empty() { return map_strings(string_array, builder, |string| &string[..0]); @@ -474,19 +472,28 @@ fn substr_index_scalar_view( } } -fn map_strings<'a, S, B, F>(string_array: S, mut builder: B, f: F) -> Result +fn map_strings<'a, S, O, F>( + string_array: S, + mut builder: GenericStringArrayBuilder, + f: F, +) -> Result where S: StringArrayType<'a> + Copy, - B: StringLikeArrayBuilder, + O: OffsetSizeTrait, F: Fn(&'a str) -> &'a str, { - for string in string_array.iter() { - match string { - Some(s) => builder.append_value(f(s)), - None => builder.append_null(), + let nulls = string_array.nulls().cloned(); + for i in 0..string_array.len() { + if nulls.as_ref().is_some_and(|n| n.is_null(i)) { + builder.append_placeholder(); + continue; } + // SAFETY: `i < string_array.len()`, and `nulls` is valid at i, so the + // input is also valid at i. + let s = unsafe { string_array.value_unchecked(i) }; + builder.append_value(f(s)); } - Ok(Arc::new(builder.finish()) as ArrayRef) + Ok(Arc::new(builder.finish(nulls)?) as ArrayRef) } #[inline] From 61867dbac525420e5b43333cdbf428f67965fd55 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 27 Apr 2026 11:26:37 -0400 Subject: [PATCH 3/3] Tweak comments --- datafusion/functions/src/unicode/substrindex.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index 1d077777b5c48..d522a358425f6 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -283,7 +283,7 @@ where builder.append_placeholder(); continue; } - // SAFETY: `i < num_rows`, and the union of input nulls is valid at i, + // SAFETY: `i < num_rows` and the union of input nulls is valid at i, // so each input is also valid at i. let string = unsafe { string_array.value_unchecked(i) }; let delimiter = unsafe { delimiter_array.value_unchecked(i) }; @@ -488,7 +488,7 @@ where builder.append_placeholder(); continue; } - // SAFETY: `i < string_array.len()`, and `nulls` is valid at i, so the + // SAFETY: `i < string_array.len()` and `nulls` is valid at i, so the // input is also valid at i. let s = unsafe { string_array.value_unchecked(i) }; builder.append_value(f(s));