From 0104e51fd2f5cf02a81e0b18b0ddbe64532720a1 Mon Sep 17 00:00:00 2001 From: Qingping Hou Date: Sun, 13 Jun 2021 10:51:05 -0700 Subject: [PATCH] hash float arrays using primitive unsigned integer type Generating hashes using u8 slices is less efficient and breaks specialization in ahash. See https://github.com/tkaitchuck/aHash/issues/93 for more details. --- datafusion/src/physical_plan/hash_join.rs | 36 ++++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 1b0322b521a5..644d2d486c85 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -877,13 +877,19 @@ macro_rules! hash_array_float { if $multi_col { for (hash, value) in $hashes.iter_mut().zip(values.iter()) { *hash = combine_hashes( - $ty::get_hash(&value.to_le_bytes(), $random_state), + $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ), *hash, ); } } else { for (hash, value) in $hashes.iter_mut().zip(values.iter()) { - *hash = $ty::get_hash(&value.to_le_bytes(), $random_state) + *hash = $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ) } } } else { @@ -893,7 +899,10 @@ macro_rules! hash_array_float { { if !array.is_null(i) { *hash = combine_hashes( - $ty::get_hash(&value.to_le_bytes(), $random_state), + $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ), *hash, ); } @@ -903,7 +912,10 @@ macro_rules!
hash_array_float { $hashes.iter_mut().zip(values.iter()).enumerate() { if !array.is_null(i) { - *hash = $ty::get_hash(&value.to_le_bytes(), $random_state); + *hash = $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ); } } } @@ -1838,6 +1850,22 @@ mod tests { Ok(()) } + #[test] + fn create_hashes_for_float_arrays() -> Result<()> { + let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); + let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; f32_arr.len()]; + let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 4,); + + let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 4,); + + Ok(()) + } + #[test] fn join_with_hash_collision() -> Result<()> { let mut hashmap_left = HashMap::with_capacity_and_hasher(2, IdHashBuilder {});