From e7a7d51854a6bc4e762755c08db1c68bf7cfd87d Mon Sep 17 00:00:00 2001 From: Patrick More Date: Fri, 30 Apr 2021 21:56:57 -0700 Subject: [PATCH 1/8] Added boolean support for count distinct. --- datafusion/src/scalar.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 833f707e971e..6f03194f4542 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -345,6 +345,7 @@ impl ScalarValue { ), }, ScalarValue::List(values, data_type) => Arc::new(match data_type { + DataType::Boolean => build_list!(BooleanBuilder, Boolean, values, size), DataType::Int8 => build_list!(Int8Builder, Int8, values, size), DataType::Int16 => build_list!(Int16Builder, Int16, values, size), DataType::Int32 => build_list!(Int32Builder, Int32, values, size), From a93a98fc9fbb3ef1c200433a77dc52a6d265a3ff Mon Sep 17 00:00:00 2001 From: Patrick More Date: Fri, 30 Apr 2021 22:07:50 -0700 Subject: [PATCH 2/8] Added boolean support for COUNT DISTINCT --- datafusion/src/scalar.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 833f707e971e..2572d61a167d 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -345,6 +345,7 @@ impl ScalarValue { ), }, ScalarValue::List(values, data_type) => Arc::new(match data_type { + DataType::Boolean => typed_cast!(array, index, BooleanArray, Boolean), DataType::Int8 => build_list!(Int8Builder, Int8, values, size), DataType::Int16 => build_list!(Int16Builder, Int16, values, size), DataType::Int32 => build_list!(Int32Builder, Int32, values, size), From e1114cb2e9cffab817abcefb0db57c1e9e09a540 Mon Sep 17 00:00:00 2001 From: Patrick More Date: Fri, 30 Apr 2021 22:18:29 -0700 Subject: [PATCH 3/8] Corrected macro call --- datafusion/src/scalar.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 2572d61a167d..6f03194f4542 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -345,7 +345,7 @@ impl ScalarValue { ), }, ScalarValue::List(values, data_type) => Arc::new(match data_type { - DataType::Boolean => typed_cast!(array, index, BooleanArray, Boolean), + DataType::Boolean => build_list!(BooleanBuilder, Boolean, values, size), DataType::Int8 => build_list!(Int8Builder, Int8, values, size), DataType::Int16 => build_list!(Int16Builder, Int16, values, size), DataType::Int32 => build_list!(Int32Builder, Int32, values, size), From 6e3ab0516959f8234f47e5c411617cc2c8892c8b Mon Sep 17 00:00:00 2001 From: Patrick More Date: Sat, 1 May 2021 11:40:50 -0700 Subject: [PATCH 4/8] Added test for boolean COUNT DISTINCT --- .../src/physical_plan/distinct_expressions.rs | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 8534e9c8805c..72115a5d825d 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -195,7 +195,7 @@ impl Accumulator for DistinctCountAccumulator { mod tests { use super::*; - use arrow::array::ArrayRef; + use arrow::{array::{ArrayRef, BooleanArray}, ipc::Bool}; use arrow::array::{ Int16Array, Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, @@ -396,6 +396,41 @@ mod tests { test_count_distinct_update_batch_numeric!(UInt64Array, UInt64, u64) } + #[test] + fn count_distinct_update_batch_boolean()->Result<()>{ + + + + let get_count = |data: BooleanArray|->Result<(Vec>, u64)>{ + let arrays = vec![Arc::new(data) as ArrayRef]; + let (states, result) = run_update_batch(&arrays)?; + let mut state_vec = + state_to_vec!(&states[0], Boolean, bool).unwrap(); + state_vec.sort(); + let count = match result{ + ScalarValue::UInt64(c)=>c.ok_or(DataFusionError::Internal(format!("Found None count"))), + scalar => Err(DataFusionError::Internal(format!("Found non Uint64 scalar value from count: {}", scalar))) + }?; + Ok((state_vec, count)) + }; + + let zero_count_values = BooleanArray::from(Vec::::new()); + + let one_count_values = BooleanArray::from(vec![false, false]); + let one_count_values_with_null = BooleanArray::from(vec![Some(true), Some(true), None, None]); + + let two_count_values = BooleanArray::from(vec![true, false, true, false, true]); + let two_count_values_with_null = BooleanArray::from(vec![Some(true), Some(false), None, None, Some(true), Some(false)]); + + + assert_eq!(get_count(zero_count_values)?, (Vec::>::new(), 0)); + assert_eq!(get_count(one_count_values)?, (vec![Some(false)], 1)); + assert_eq!(get_count(one_count_values_with_null)?, (vec![Some(false)], 1)); + assert_eq!(get_count(two_count_values)?, (vec![Some(false), Some(true)], 2)); + assert_eq!(get_count(two_count_values_with_null)?, (vec![Some(false), Some(true)], 2)); + Ok(()) + } + #[test] fn count_distinct_update_batch_all_nulls() -> Result<()> { let arrays = vec![Arc::new(Int32Array::from( From 154c3844851d8f79bb18c254107c0115c0844803 Mon Sep 17 00:00:00 2001 From: Patrick More Date: Sat, 1 May 2021 11:53:03 -0700 Subject: [PATCH 5/8] ran cargo fmt --- .../src/physical_plan/distinct_expressions.rs | 59 ++++++++++++------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 72115a5d825d..135e4ec3e1c6 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -195,10 +195,9 @@ impl Accumulator for DistinctCountAccumulator { mod tests { use super::*; - use arrow::{array::{ArrayRef, BooleanArray}, ipc::Bool}; use arrow::array::{ - Int16Array, Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + ArrayRef, BooleanArray, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::array::{Int32Builder, ListBuilder, UInt64Builder}; use arrow::datatypes::DataType; @@ -397,19 +396,20 @@ mod tests { } #[test] - fn count_distinct_update_batch_boolean()->Result<()>{ - - - - let get_count = |data: BooleanArray|->Result<(Vec>, u64)>{ + fn count_distinct_update_batch_boolean() -> Result<()> { + let get_count = |data: BooleanArray| -> Result<(Vec>, u64)> { let arrays = vec![Arc::new(data) as ArrayRef]; let (states, result) = run_update_batch(&arrays)?; - let mut state_vec = - state_to_vec!(&states[0], Boolean, bool).unwrap(); + let mut state_vec = state_to_vec!(&states[0], Boolean, bool).unwrap(); state_vec.sort(); - let count = match result{ - ScalarValue::UInt64(c)=>c.ok_or(DataFusionError::Internal(format!("Found None count"))), - scalar => Err(DataFusionError::Internal(format!("Found non Uint64 scalar value from count: {}", scalar))) + let count = match result { + ScalarValue::UInt64(c) => { + c.ok_or(DataFusionError::Internal(format!("Found None count"))) + } + scalar => Err(DataFusionError::Internal(format!( + "Found non Uint64 scalar value from count: {}", + scalar + ))), }?; Ok((state_vec, count)) }; @@ -417,17 +417,36 @@ mod tests { let zero_count_values = BooleanArray::from(Vec::::new()); let one_count_values = BooleanArray::from(vec![false, false]); - let one_count_values_with_null = BooleanArray::from(vec![Some(true), Some(true), None, None]); + let one_count_values_with_null = + BooleanArray::from(vec![Some(true), Some(true), None, None]); let two_count_values = BooleanArray::from(vec![true, false, true, false, true]); - let two_count_values_with_null = BooleanArray::from(vec![Some(true), Some(false), None, None, Some(true), Some(false)]); - + let two_count_values_with_null = BooleanArray::from(vec![ + Some(true), + Some(false), + None, + None, + Some(true), + Some(false), + ]); - assert_eq!(get_count(zero_count_values)?, (Vec::>::new(), 0)); + assert_eq!( + get_count(zero_count_values)?, + (Vec::>::new(), 0) + ); assert_eq!(get_count(one_count_values)?, (vec![Some(false)], 1)); - assert_eq!(get_count(one_count_values_with_null)?, (vec![Some(false)], 1)); - assert_eq!(get_count(two_count_values)?, (vec![Some(false), Some(true)], 2)); - assert_eq!(get_count(two_count_values_with_null)?, (vec![Some(false), Some(true)], 2)); + assert_eq!( + get_count(one_count_values_with_null)?, + (vec![Some(false)], 1) + ); + assert_eq!( + get_count(two_count_values)?, + (vec![Some(false), Some(true)], 2) + ); + assert_eq!( + get_count(two_count_values_with_null)?, + (vec![Some(false), Some(true)], 2) + ); Ok(()) } From 9e48830aa78d2508a2f4ab36b47bf318055b98c5 Mon Sep 17 00:00:00 2001 From: Patrick More Date: Sat, 1 May 2021 12:41:00 -0700 Subject: [PATCH 6/8] Corrected test assertion for boolean COUNT DISTINCT --- datafusion/src/physical_plan/distinct_expressions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 135e4ec3e1c6..64e2caddee75 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -437,7 +437,7 @@ mod tests { assert_eq!(get_count(one_count_values)?, (vec![Some(false)], 1)); assert_eq!( get_count(one_count_values_with_null)?, - (vec![Some(false)], 1) + (vec![Some(true)], 1) ); assert_eq!( get_count(two_count_values)?, From 5adea6b340f19109a336fabeb57ccb4093ae9c0c Mon Sep 17 00:00:00 2001 From: Patrick More Date: Sun, 2 May 2021 12:29:04 -0700 Subject: [PATCH 7/8] Fixed clippy warnings --- datafusion/src/physical_plan/distinct_expressions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 64e2caddee75..ead272dfb5da 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -404,7 +404,7 @@ mod tests { state_vec.sort(); let count = match result { ScalarValue::UInt64(c) => { - c.ok_or(DataFusionError::Internal(format!("Found None count"))) + c.ok_or_else(|| DataFusionError::Internal("Found None count".to_string())) } scalar => Err(DataFusionError::Internal(format!( "Found non Uint64 scalar value from count: {}", From d90b8a74b029111df4f17687aa46d3a270e53f92 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 3 May 2021 09:56:24 -0400 Subject: [PATCH 8/8] fix cargo fmt --- datafusion/src/physical_plan/distinct_expressions.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index ead272dfb5da..1c93b5a104d0 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -403,9 +403,9 @@ mod tests { let mut state_vec = state_to_vec!(&states[0], Boolean, bool).unwrap(); state_vec.sort(); let count = match result { - ScalarValue::UInt64(c) => { - c.ok_or_else(|| DataFusionError::Internal("Found None count".to_string())) - } + ScalarValue::UInt64(c) => c.ok_or_else(|| { + DataFusionError::Internal("Found None count".to_string()) + }), scalar => Err(DataFusionError::Internal(format!( "Found non Uint64 scalar value from count: {}", scalar