From 47124512b710c15fbf4094f5c68b4e164cb1cea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 19 May 2026 07:28:51 +0200 Subject: [PATCH] perf: combine_hashes for primitive multi-column rehash Aligns hash_array_primitive's rehash path with the existing pattern used by hash_array (generic variable-width) and hash_dictionary_array: hash the current value once with the query RandomState and combine it with the previous row hash via combine_hashes, instead of constructing a fresh foldhash::SeedableRandomState per row to fold the seed in. Measured on ClickBench q32 (`GROUP BY WatchID, ClientIP ORDER BY COUNT(*) DESC LIMIT 10`, partitioned dataset, dfbench). Baseline on current main: 3988.27 ms over 3 iterations. With this change, a 10-iteration run on the same machine settled around 1.7 s, with a warm tail of 1.4-1.5 s. Validation: - cargo test -p datafusion-common hash_utils - cargo clippy -p datafusion-common --all-targets --all-features -- -D warnings --- datafusion/common/src/hash_utils.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index fcc2e919b6cc2..3a6b97df233ea 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -211,8 +211,8 @@ fn seeded_state(seed: u64) -> foldhash::fast::SeedableRandomState { } /// Builds hash values of PrimitiveArray and writes them into `hashes_buffer` -/// If `rehash==true` this folds the existing hash into the hasher state -/// and hashes only the new value (avoiding a separate combine step). +/// If `rehash==true` this combines the previous hash value in the buffer +/// with the new hash using `combine_hashes`. 
#[cfg(not(feature = "force_hash_collisions"))] fn hash_array_primitive( array: &PrimitiveArray, @@ -231,9 +231,7 @@ fn hash_array_primitive( if array.null_count() == 0 { if rehash { for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) { - let mut hasher = seeded_state(*hash).build_hasher(); - value.hash_write(&mut hasher); - *hash = hasher.finish(); + *hash = combine_hashes(value.hash_one(random_state), *hash); } } else { for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) { @@ -243,9 +241,8 @@ fn hash_array_primitive( } else if rehash { for i in array.nulls().unwrap().valid_indices() { let value = unsafe { array.value_unchecked(i) }; - let mut hasher = seeded_state(hashes_buffer[i]).build_hasher(); - value.hash_write(&mut hasher); - hashes_buffer[i] = hasher.finish(); + hashes_buffer[i] = + combine_hashes(value.hash_one(random_state), hashes_buffer[i]); } } else { for i in array.nulls().unwrap().valid_indices() {