From d48d1a43cf2a8b31e482d0495c737582f20e06f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 19 May 2026 07:51:10 +0200 Subject: [PATCH] perf: combine_hashes for byte-view multi-column rehash Aligns the byte-view paths (Utf8View / BinaryView) in hash_string_view_array_inner and hash_generic_byte_view_array's no-buffers rehash branch with the pattern already used by the generic variable-width path (hash_array) and the dictionary path (hash_dictionary_array): hash the current value once with the query RandomState and combine it with the previous row hash via combine_hashes, instead of constructing a fresh foldhash::SeedableRandomState per row that folds the seed into a new hasher state. Measured on ClickBench q16 and q17 (`GROUP BY UserID, SearchPhrase`, partitioned dataset, dfbench, current main baselines 1106.69 / 1127.64 ms over 3 iterations): patched 3-iteration runs settled around 986.76 ms (q16, -10.8%) and 1002.29 ms (q17, -11.1%). Validation: - cargo test -p datafusion-common hash_utils - cargo clippy -p datafusion-common --all-targets --all-features -- -D warnings --- datafusion/common/src/hash_utils.rs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index fcc2e919b6cc2..2103f9855af0f 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -345,9 +345,7 @@ fn hash_string_view_array_inner< // all views are inlined, no need to access external buffers if !HAS_BUFFERS || view_len <= 12 { if REHASH { - let mut hasher = seeded_state(*hash).build_hasher(); - v.hash_write(&mut hasher); - *hash = hasher.finish(); + *hash = combine_hashes(v.hash_one(random_state), *hash); } else { *hash = v.hash_one(random_state); } @@ -356,9 +354,7 @@ fn hash_string_view_array_inner< // view is not inlined, so we need to hash the bytes as well let value = view_bytes(view_len, v); if REHASH { - let mut hasher = 
seeded_state(*hash).build_hasher(); - value.hash_write(&mut hasher); - *hash = hasher.finish(); + *hash = combine_hashes(value.hash_one(random_state), *hash); } else { *hash = value.hash_one(random_state); } @@ -390,9 +386,7 @@ fn hash_generic_byte_view_array( } (false, false, true) => { for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) { - let mut hasher = seeded_state(*hash).build_hasher(); - view.hash_write(&mut hasher); - *hash = hasher.finish(); + *hash = combine_hashes(view.hash_one(random_state), *hash); } } (false, true, false) => hash_string_view_array_inner::<T, false, true, false>(