Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 121 additions & 2 deletions datafusion/common/src/hash_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ macro_rules! hash_value {
})+
};
}
hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
hash_value!(i8, i16, i32, i64, u128, i128, i256, u8, u16, u32, u64);
hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);

macro_rules! hash_float_value {
Expand Down Expand Up @@ -221,6 +221,125 @@ fn hash_array_primitive<T>(
}
}

/// Hash a string view array.
///
/// Templated to optimize inner loop based on presence of nulls and external buffers.
///
/// HAS_NULLS: do we have to check in the inner loop
/// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined/ less then 12 bytes
/// REHASH: if true, combining with existing hash, otherwise initializing
#[inline(never)]
fn hash_string_view_array_inner<
const HAS_NULLS: bool,
const HAS_BUFFERS: bool,
const REHASH: bool,
>(
array: &StringViewArray,
random_state: &RandomState,
hashes_buffer: &mut [u64],
) {
assert_eq!(
hashes_buffer.len(),
array.len(),
"hashes_buffer and array should be of equal length"
);

for (i, hash) in hashes_buffer.iter_mut().enumerate() {
if HAS_NULLS && array.is_null(i) {
continue;
}
// SAFETY: length is verified above so i is within bounds
let v = unsafe { *array.views().get_unchecked(i) };
let view_len = v as u32;
// all views are inlined, no need to access external buffers
if !HAS_BUFFERS || view_len <= 12 {
if REHASH {
*hash = combine_hashes(v.hash_one(random_state), *hash);
} else {
*hash = v.hash_one(random_state);
}
continue;
}
// view is not inlined, so we need to hash the bytes as well
let view = ByteView::from(v);
let data = unsafe {
array
.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
// SAFETY: view is a valid view as it came from the array
let view_bytes =
unsafe { data.get_unchecked(offset..offset + view_len as usize) };
if REHASH {
*hash = combine_hashes(view_bytes.hash_one(random_state), *hash);
} else {
*hash = view_bytes.hash_one(random_state);
}
}
}

/// Builds hash values for array views and writes them into `hashes_buffer`
/// If `rehash==true` this combines the previous hash value in the buffer
/// with the new hash using `combine_hashes`
///
/// TODO: make general for butesview as well
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_string_view_array(
array: &StringViewArray,
random_state: &RandomState,
hashes_buffer: &mut [u64],
rehash: bool,
) {
// instantiate the correct version based on presence of nulls and external buffers
match (
array.null_count() != 0,
!array.data_buffers().is_empty(),
rehash,
) {
(false, false, false) => hash_string_view_array_inner::<false, false, false>(
array,
random_state,
hashes_buffer,
),
(false, false, true) => hash_string_view_array_inner::<false, false, true>(
array,
random_state,
hashes_buffer,
),
(false, true, false) => hash_string_view_array_inner::<false, true, false>(
array,
random_state,
hashes_buffer,
),
(false, true, true) => hash_string_view_array_inner::<false, true, true>(
array,
random_state,
hashes_buffer,
),
(true, false, false) => hash_string_view_array_inner::<true, false, false>(
array,
random_state,
hashes_buffer,
),
(true, false, true) => hash_string_view_array_inner::<true, false, true>(
array,
random_state,
hashes_buffer,
),
(true, true, false) => hash_string_view_array_inner::<true, true, false>(
array,
random_state,
hashes_buffer,
),
(true, true, true) => hash_string_view_array_inner::<true, true, true>(
array,
random_state,
hashes_buffer,
),
}
}

/// Hashes one array into the `hashes_buffer`
/// If `rehash==true` this combines the previous hash value in the buffer
/// with the new hash using `combine_hashes`
Expand Down Expand Up @@ -568,7 +687,7 @@ fn hash_single_array(
DataType::Null => hash_null(random_state, hashes_buffer, rehash),
DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash),
DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash),
DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash),
DataType::Utf8View => hash_string_view_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash),
DataType::Binary => hash_array(&as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,38 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
self.do_append_val_inner(arr, row);
}

fn vectorized_equal_to_inner(
fn vectorized_equal_to_inner_no_nulls(
&self,
lhs_rows: &[usize],
array: &ArrayRef,
array: &GenericByteViewArray<B>,
rhs_rows: &[usize],
equal_to_results: &mut [bool],
) {
let array = array.as_byte_view::<B>();
let iter = izip!(
lhs_rows.iter(),
rhs_rows.iter(),
equal_to_results.iter_mut(),
);

// if the input array has no nulls, can skip null check
for (&lhs_row, &rhs_row, equal_to_result) in iter {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can get this to compile to a well-vectorized loop.

// Has found not equal to, don't need to check
if !*equal_to_result {
continue;
}

*equal_to_result =
self.do_equal_to_inner_values_only(lhs_row, array, rhs_row);
}
}

fn vectorized_equal_to_inner_nulls(
&self,
lhs_rows: &[usize],
array: &GenericByteViewArray<B>,
rhs_rows: &[usize],
equal_to_results: &mut [bool],
) {
let iter = izip!(
lhs_rows.iter(),
rhs_rows.iter(),
Expand Down Expand Up @@ -226,14 +249,26 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
let exist_null = self.nulls.is_null(lhs_row);
let input_null = array.is_null(rhs_row);
if let Some(result) = nulls_equal_to(exist_null, input_null) {
return result;
result
} else {
self.do_equal_to_inner_values_only(lhs_row, array, rhs_row)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps this could be optimized as well (in the outer loop) for empty data buffers

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will give it a try

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's move this conversation to #19364

}
}

// Otherwise, we need to check their values
let exist_view = self.views[lhs_row];
/// Same as equal_to_inner, but without null checks
///
/// Checks only the values for equality
#[inline(always)]
fn do_equal_to_inner_values_only(
&self,
lhs_row: usize,
array: &GenericByteViewArray<B>,
rhs_row: usize,
) -> bool {
let exist_view = unsafe { *self.views.get_unchecked(lhs_row) };
let exist_view_len = exist_view as u32;

let input_view = array.views()[rhs_row];
let input_view = unsafe { *array.views().get_unchecked(rhs_row) };
let input_view_len = input_view as u32;

// The check logic
Expand All @@ -246,19 +281,9 @@ impl<B: ByteViewType> ByteViewGroupValueBuilder<B> {
}

if exist_view_len <= 12 {
let exist_inline = unsafe {
GenericByteViewArray::<B>::inline_value(
&exist_view,
exist_view_len as usize,
)
};
let input_inline = unsafe {
GenericByteViewArray::<B>::inline_value(
&input_view,
input_view_len as usize,
)
};
exist_inline == input_inline
// the views are inlined and the lengths are equal, so just
// compare the views directly
exist_view == input_view
} else {
let exist_prefix =
unsafe { GenericByteViewArray::<B>::inline_value(&exist_view, 4) };
Expand Down Expand Up @@ -507,7 +532,22 @@ impl<B: ByteViewType> GroupColumn for ByteViewGroupValueBuilder<B> {
rows: &[usize],
equal_to_results: &mut [bool],
) {
self.vectorized_equal_to_inner(group_indices, array, rows, equal_to_results);
let array = array.as_byte_view::<B>();
if array.null_count() == 0 {
self.vectorized_equal_to_inner_no_nulls(
group_indices,
array,
rows,
equal_to_results,
);
} else {
self.vectorized_equal_to_inner_nulls(
group_indices,
array,
rows,
equal_to_results,
);
}
}

fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) -> Result<()> {
Expand Down