-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Refactor InListExpr to support structs by re-using existing hashing infrastructure #18449
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
8f83f9c
Refactor InListExpr to support structs by re-using existing hashing i…
adriangb cd9e2f5
remove enum comparator
adriangb 896820e
use const thread local
adriangb 621cfe5
Consolidate StaticFilter and ArrayHashSet (#44)
alamb 8a2ee06
fix rebase
adriangb 06a4763
Add specialized sets for primitive types
alamb File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,8 +31,8 @@ use crate::cast::{ | |
| as_string_array, as_string_view_array, as_struct_array, | ||
| }; | ||
| use crate::error::Result; | ||
| #[cfg(not(feature = "force_hash_collisions"))] | ||
| use crate::error::_internal_err; | ||
| use crate::error::{_internal_datafusion_err, _internal_err}; | ||
| use std::cell::RefCell; | ||
|
|
||
| // Combines two hashes into one hash | ||
| #[inline] | ||
|
|
@@ -41,6 +41,94 @@ pub fn combine_hashes(l: u64, r: u64) -> u64 { | |
| hash.wrapping_mul(37).wrapping_add(r) | ||
| } | ||
|
|
||
| /// Maximum number of `u64` elements the thread-local hash buffer keeps between uses | ||
| /// (524,288 elements * 8 bytes per element = 4MB). | ||
| /// The goal of this is to avoid unbounded memory growth that would appear as a memory leak. | ||
| /// We allow temporary allocations beyond this size, but after use the buffer is truncated | ||
| /// to this size. | ||
| const MAX_BUFFER_SIZE: usize = 524_288; | ||
|
|
||
| thread_local! { | ||
| /// Thread-local buffer for hash computations to avoid repeated allocations. | ||
| /// The buffer is reused across calls and truncated if it exceeds MAX_BUFFER_SIZE. | ||
| /// It starts out empty: `Vec::new()` does not allocate, which is also what makes the | ||
| /// `const` initializer possible. `with_hashes` resizes it on every call to the length | ||
| /// of the first input array. | ||
| static HASH_BUFFER: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) }; | ||
| } | ||
|
|
||
| /// Creates hashes for the given arrays using a thread-local buffer, then calls the provided callback | ||
| /// with an immutable reference to the computed hashes. | ||
| /// | ||
| /// This function manages a thread-local buffer to avoid repeated allocations. Once the buffer has been | ||
| /// borrowed, it is shrunk back to `MAX_BUFFER_SIZE` elements before returning if it grew beyond that, | ||
| /// regardless of whether hashing or the callback succeeded or failed. | ||
| /// | ||
| /// # Arguments | ||
| /// * `arrays` - The arrays to hash (must contain at least one array); the length of the first array | ||
| ///   determines how many hashes are produced | ||
| /// * `random_state` - The random state for hashing | ||
| /// * `callback` - A function that receives an immutable reference to the hash slice and returns a result | ||
| /// | ||
| /// # Errors | ||
| /// Returns an error if: | ||
| /// - No arrays are provided | ||
| /// - `create_hashes` fails or the callback returns an error (the error is passed through unchanged) | ||
| /// - The function is called reentrantly (i.e., the callback invokes `with_hashes` again on the same thread) | ||
| /// - The function is called during or after thread destruction | ||
| /// | ||
| /// # Example | ||
| /// ```ignore | ||
| /// use datafusion_common::hash_utils::{with_hashes, RandomState}; | ||
| /// use arrow::array::{Int32Array, ArrayRef}; | ||
| /// use std::sync::Arc; | ||
| /// | ||
| /// let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); | ||
| /// let random_state = RandomState::new(); | ||
| /// | ||
| /// let result = with_hashes([&array], &random_state, |hashes| { | ||
| /// // Use the hashes here | ||
| /// Ok(hashes.len()) | ||
| /// })?; | ||
| /// ``` | ||
| pub fn with_hashes<I, T, F, R>( | ||
|     arrays: I, | ||
|     random_state: &RandomState, | ||
|     callback: F, | ||
| ) -> Result<R> | ||
| where | ||
|     I: IntoIterator<Item = T>, | ||
|     T: AsDynArray, | ||
|     F: FnOnce(&[u64]) -> Result<R>, | ||
| { | ||
|     // Peek at the first array to determine buffer size without fully collecting | ||
|     let mut iter = arrays.into_iter().peekable(); | ||
|     // Get the required size from the first array | ||
|     let required_size = match iter.peek() { | ||
|         Some(arr) => arr.as_dyn_array().len(), | ||
|         None => return _internal_err!("with_hashes requires at least one array"), | ||
|     }; | ||
|     HASH_BUFFER.try_with(|cell| { | ||
|         // A failed borrow means an outer `with_hashes` call on this thread still holds the buffer | ||
|         let mut buffer = cell.try_borrow_mut() | ||
|             .map_err(|_| _internal_datafusion_err!("with_hashes cannot be called reentrantly on the same thread"))?; | ||
|         // Ensure buffer has sufficient length, clearing old values | ||
|         buffer.clear(); | ||
|         buffer.resize(required_size, 0); | ||
|         // Create hashes in the buffer - this consumes the iterator. | ||
|         // Deliberately no `?` here or on the callback: an early return would skip the cleanup below | ||
|         // and leave an oversized buffer allocated on this thread. | ||
|         let hashed = create_hashes(iter, random_state, &mut buffer[..required_size]).map(|_| ()); | ||
|         // Execute the callback with an immutable slice (only if hashing succeeded) | ||
|         let result = hashed.and_then(|()| callback(&buffer[..required_size])); | ||
|         // Cleanup: truncate if buffer grew too large; runs on both the success and the error path | ||
|         if buffer.capacity() > MAX_BUFFER_SIZE { | ||
|             buffer.truncate(MAX_BUFFER_SIZE); | ||
|             buffer.shrink_to_fit(); | ||
|         } | ||
|         result | ||
|     }).map_err(|_| _internal_datafusion_err!("with_hashes cannot access thread-local storage during or after thread destruction"))? | ||
| } | ||
|
|
||
| #[cfg(not(feature = "force_hash_collisions"))] | ||
| fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) { | ||
| if mul_col { | ||
|
|
@@ -478,8 +566,8 @@ impl AsDynArray for &ArrayRef { | |
| pub fn create_hashes<'a, I, T>( | ||
| arrays: I, | ||
| random_state: &RandomState, | ||
| hashes_buffer: &'a mut Vec<u64>, | ||
| ) -> Result<&'a mut Vec<u64>> | ||
| hashes_buffer: &'a mut [u64], | ||
| ) -> Result<&'a mut [u64]> | ||
| where | ||
| I: IntoIterator<Item = T>, | ||
| T: AsDynArray, | ||
|
|
@@ -522,7 +610,7 @@ mod tests { | |
| fn create_hashes_for_empty_fixed_size_lit() -> Result<()> { | ||
| let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish(); | ||
| let random_state = RandomState::with_seeds(0, 0, 0, 0); | ||
| let hashes_buff = &mut vec![0; 0]; | ||
| let hashes_buff = &mut [0; 0]; | ||
| let hashes = create_hashes( | ||
| &[Arc::new(empty_array) as ArrayRef], | ||
| &random_state, | ||
|
|
@@ -1000,4 +1088,84 @@ mod tests { | |
|
|
||
| assert_eq!(hashes1, hashes2); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_with_hashes() { | ||
|     let seed = RandomState::with_seeds(0, 0, 0, 0); | ||
|     let input: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); | ||
|     // Reference hashes computed directly through create_hashes | ||
|     let mut reference = vec![0; input.len()]; | ||
|     create_hashes([&input], &seed, &mut reference).unwrap(); | ||
|     // The callback must see exactly the reference hashes ... | ||
|     let returned = with_hashes([&input], &seed, |computed| { | ||
|         assert_eq!(computed.len(), 4); | ||
|         assert_eq!(computed, &reference[..]); | ||
|         // Hand a copy back out so the return path can be checked as well | ||
|         Ok(computed.to_vec()) | ||
|     }) | ||
|     .unwrap(); | ||
|     // ... and whatever the callback returns must be passed through to the caller | ||
|     assert_eq!(returned, reference); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_with_hashes_multi_column() { | ||
|     let seed = RandomState::with_seeds(0, 0, 0, 0); | ||
|     let ints: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); | ||
|     let strs: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"])); | ||
|     // Reference: hash both columns together through create_hashes | ||
|     let mut reference = vec![0; ints.len()]; | ||
|     create_hashes([&ints, &strs], &seed, &mut reference).unwrap(); | ||
|     // with_hashes over the same columns must yield the same combined hashes | ||
|     with_hashes([&ints, &strs], &seed, |computed| { | ||
|         assert_eq!(computed.len(), 3); | ||
|         assert_eq!(computed, &reference[..]); | ||
|         Ok(()) | ||
|     }) | ||
|     .unwrap(); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_with_hashes_empty_arrays() { | ||
|     let seed = RandomState::with_seeds(0, 0, 0, 0); | ||
|     // An input with zero arrays gives with_hashes nothing to size its buffer from | ||
|     let no_arrays: [&ArrayRef; 0] = []; | ||
|     let outcome = with_hashes(no_arrays, &seed, |_hashes| Ok(())); | ||
|     // It must be rejected with the dedicated error message | ||
|     assert!(outcome.is_err()); | ||
|     let message = outcome.unwrap_err().to_string(); | ||
|     assert!(message.contains("requires at least one array")); | ||
| } | ||
|
|
||
| /// Calling `with_hashes` from inside its own callback must produce an error rather than a panic: | ||
| /// the outer call still holds the mutable borrow of the thread-local buffer. | ||
| #[test] | ||
| fn test_with_hashes_reentrancy() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please add a test / verify the truncate / shrink to fit behavior ? I think that is probably important |
||
| let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); | ||
| let array2: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6])); | ||
| let random_state = RandomState::with_seeds(0, 0, 0, 0); | ||
|
|
||
| // Test that reentrant calls return an error instead of panicking | ||
| let result = with_hashes([&array], &random_state, |_hashes| { | ||
| // Try to call with_hashes again inside the callback | ||
| with_hashes([&array2], &random_state, |_inner_hashes| Ok(())) | ||
| }); | ||
|
|
||
| // The inner call's error is the outer callback's return value, so it surfaces here | ||
| assert!(result.is_err()); | ||
| let err_msg = result.unwrap_err().to_string(); | ||
| assert!( | ||
| err_msg.contains("reentrantly") || err_msg.contains("cannot be called"), | ||
| "Error message should mention reentrancy: {err_msg}", | ||
| ); | ||
| } | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I double-checked that the code already asserts that `hashes_buffer` and the arrays are the same length (i.e., it never relies on the buffer being a `Vec` in order to grow the allocation)