-
Notifications
You must be signed in to change notification settings - Fork 1.7k
feat: convert_array_to_scalar_vec respects null elements #17891
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3246,6 +3246,8 @@ impl ScalarValue { | |
|
||
/// Retrieve ScalarValue for each row in `array` | ||
/// | ||
/// Elements in `array` may be NULL, in which case the corresponding element in the returned vector is None. | ||
/// | ||
/// Example 1: Array (ScalarValue::Int32) | ||
/// ``` | ||
/// use datafusion_common::ScalarValue; | ||
|
@@ -3262,15 +3264,15 @@ impl ScalarValue { | |
/// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap(); | ||
/// | ||
/// let expected = vec![ | ||
/// vec![ | ||
/// ScalarValue::Int32(Some(1)), | ||
/// ScalarValue::Int32(Some(2)), | ||
/// ScalarValue::Int32(Some(3)), | ||
/// ], | ||
/// vec![ | ||
/// ScalarValue::Int32(Some(4)), | ||
/// ScalarValue::Int32(Some(5)), | ||
/// ], | ||
/// Some(vec![ | ||
/// ScalarValue::Int32(Some(1)), | ||
/// ScalarValue::Int32(Some(2)), | ||
/// ScalarValue::Int32(Some(3)), | ||
/// ]), | ||
/// Some(vec![ | ||
/// ScalarValue::Int32(Some(4)), | ||
/// ScalarValue::Int32(Some(5)), | ||
/// ]), | ||
/// ]; | ||
/// | ||
/// assert_eq!(scalar_vec, expected); | ||
|
@@ -3303,28 +3305,62 @@ impl ScalarValue { | |
/// ]); | ||
/// | ||
/// let expected = vec![ | ||
/// vec![ | ||
/// Some(vec![ | ||
/// ScalarValue::List(Arc::new(l1)), | ||
/// ScalarValue::List(Arc::new(l2)), | ||
/// ], | ||
/// ]), | ||
/// ]; | ||
/// | ||
/// assert_eq!(scalar_vec, expected); | ||
/// ``` | ||
/// | ||
/// Example 3: Nullable array | ||
/// ``` | ||
/// use datafusion_common::ScalarValue; | ||
/// use arrow::array::ListArray; | ||
/// use arrow::datatypes::{DataType, Int32Type}; | ||
/// | ||
/// let list_arr = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![ | ||
/// Some(vec![Some(1), Some(2), Some(3)]), | ||
/// None, | ||
/// Some(vec![Some(4), Some(5)]) | ||
/// ]); | ||
/// | ||
/// // Convert the array into Scalar Values for each row | ||
/// let scalar_vec = ScalarValue::convert_array_to_scalar_vec(&list_arr).unwrap(); | ||
/// | ||
/// let expected = vec![ | ||
/// Some(vec![ | ||
/// ScalarValue::Int32(Some(1)), | ||
/// ScalarValue::Int32(Some(2)), | ||
/// ScalarValue::Int32(Some(3)), | ||
/// ]), | ||
/// None, | ||
/// Some(vec![ | ||
/// ScalarValue::Int32(Some(4)), | ||
/// ScalarValue::Int32(Some(5)), | ||
/// ]), | ||
/// ]; | ||
/// | ||
/// assert_eq!(scalar_vec, expected); | ||
/// ``` | ||
pub fn convert_array_to_scalar_vec(array: &dyn Array) -> Result<Vec<Vec<Self>>> { | ||
pub fn convert_array_to_scalar_vec( | ||
array: &dyn Array, | ||
) -> Result<Vec<Option<Vec<Self>>>> { | ||
fn generic_collect<OffsetSize: OffsetSizeTrait>( | ||
array: &dyn Array, | ||
) -> Result<Vec<Vec<ScalarValue>>> { | ||
) -> Result<Vec<Option<Vec<ScalarValue>>>> { | ||
array | ||
.as_list::<OffsetSize>() | ||
.iter() | ||
.map(|nested_array| match nested_array { | ||
Some(nested_array) => (0..nested_array.len()) | ||
.map(|i| ScalarValue::try_from_array(&nested_array, i)) | ||
.collect::<Result<Vec<_>>>(), | ||
// TODO: what can we put for null? | ||
// https://github.com/apache/datafusion/issues/17749 | ||
None => Ok(vec![]), | ||
.map(|nested_array| { | ||
nested_array | ||
.map(|array| { | ||
(0..array.len()) | ||
.map(|i| ScalarValue::try_from_array(&array, i)) | ||
.collect::<Result<Vec<_>>>() | ||
}) | ||
.transpose() | ||
}) | ||
.collect() | ||
} | ||
|
@@ -9021,7 +9057,7 @@ mod tests { | |
|
||
#[test] | ||
fn test_convert_array_to_scalar_vec() { | ||
// Regular ListArray | ||
// 1: Regular ListArray | ||
let list = ListArray::from_iter_primitive::<Int64Type, _, _>(vec![ | ||
Some(vec![Some(1), Some(2)]), | ||
None, | ||
|
@@ -9031,17 +9067,20 @@ mod tests { | |
assert_eq!( | ||
converted, | ||
vec![ | ||
vec![ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2))], | ||
vec![], | ||
vec![ | ||
Some(vec![ | ||
ScalarValue::Int64(Some(1)), | ||
ScalarValue::Int64(Some(2)) | ||
]), | ||
None, | ||
Some(vec![ | ||
ScalarValue::Int64(Some(3)), | ||
ScalarValue::Int64(None), | ||
ScalarValue::Int64(Some(4)) | ||
], | ||
]), | ||
] | ||
); | ||
|
||
// Regular LargeListArray | ||
// 2: Regular LargeListArray | ||
let large_list = LargeListArray::from_iter_primitive::<Int64Type, _, _>(vec![ | ||
Some(vec![Some(1), Some(2)]), | ||
None, | ||
|
@@ -9051,17 +9090,20 @@ mod tests { | |
assert_eq!( | ||
converted, | ||
vec![ | ||
vec![ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2))], | ||
vec![], | ||
vec![ | ||
Some(vec![ | ||
ScalarValue::Int64(Some(1)), | ||
ScalarValue::Int64(Some(2)) | ||
]), | ||
None, | ||
Some(vec![ | ||
ScalarValue::Int64(Some(3)), | ||
ScalarValue::Int64(None), | ||
ScalarValue::Int64(Some(4)) | ||
], | ||
]), | ||
] | ||
); | ||
|
||
// Funky (null slot has non-zero list offsets) | ||
// 3: Funky (null slot has non-zero list offsets) | ||
// Offsets + Values looks like this: [[1, 2], [3, 4], [5]] | ||
// But with NullBuffer it's like this: [[1, 2], NULL, [5]] | ||
let funky = ListArray::new( | ||
|
@@ -9074,9 +9116,63 @@ mod tests { | |
assert_eq!( | ||
converted, | ||
vec![ | ||
vec![ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(2))], | ||
vec![], | ||
vec![ScalarValue::Int64(Some(5))], | ||
Some(vec![ | ||
ScalarValue::Int64(Some(1)), | ||
ScalarValue::Int64(Some(2)) | ||
]), | ||
None, | ||
Some(vec![ScalarValue::Int64(Some(5))]), | ||
] | ||
); | ||
|
||
// 4: Offsets + Values looks like this: [[1, 2], [], [5]] | ||
// But with NullBuffer it's like this: [[1, 2], NULL, [5]] | ||
// The converted result is: [[1, 2], None, [5]] | ||
let array4 = ListArray::new( | ||
Field::new_list_field(DataType::Int64, true).into(), | ||
OffsetBuffer::new(vec![0, 2, 2, 5].into()), | ||
Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])), | ||
Some(NullBuffer::from(vec![true, false, true])), | ||
); | ||
let converted = ScalarValue::convert_array_to_scalar_vec(&array4).unwrap(); | ||
assert_eq!( | ||
converted, | ||
vec![ | ||
Some(vec![ | ||
ScalarValue::Int64(Some(1)), | ||
ScalarValue::Int64(Some(2)) | ||
]), | ||
None, | ||
Some(vec![ | ||
ScalarValue::Int64(Some(3)), | ||
ScalarValue::Int64(Some(4)), | ||
ScalarValue::Int64(Some(5)), | ||
]), | ||
] | ||
); | ||
|
||
// 5: Offsets + Values looks like this: [[1, 2], [], [5]] | ||
// Same as 4, but the middle array is not null, so after conversion it's empty. | ||
let array5 = ListArray::new( | ||
Field::new_list_field(DataType::Int64, true).into(), | ||
OffsetBuffer::new(vec![0, 2, 2, 5].into()), | ||
Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])), | ||
Some(NullBuffer::from(vec![true, true, true])), | ||
); | ||
let converted = ScalarValue::convert_array_to_scalar_vec(&array5).unwrap(); | ||
assert_eq!( | ||
converted, | ||
vec![ | ||
Some(vec![ | ||
ScalarValue::Int64(Some(1)), | ||
ScalarValue::Int64(Some(2)) | ||
]), | ||
Some(vec![]), | ||
Some(vec![ | ||
ScalarValue::Int64(Some(3)), | ||
ScalarValue::Int64(Some(4)), | ||
ScalarValue::Int64(Some(5)), | ||
]), | ||
] | ||
Comment on lines
+9154
to
9176
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On main the middle element here would also be an empty vec, not |
||
); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,7 +87,7 @@ impl PartialOrd for CustomElement<'_> { | |
|
||
/// This functions merges `values` array (`&[Vec<ScalarValue>]`) into single array `Vec<ScalarValue>` | ||
/// Merging done according to ordering values stored inside `ordering_values` (`&[Vec<Vec<ScalarValue>>]`) | ||
/// Inner `Vec<ScalarValue>` in the `ordering_values` can be thought as ordering information for the | ||
/// Inner `Vec<ScalarValue>` in the `ordering_values` can be thought as ordering information for | ||
/// each `ScalarValue` in the `values` array. | ||
/// Desired ordering specified by `sort_options` argument (Should have same size with inner `Vec<ScalarValue>` | ||
/// of the `ordering_values` array). | ||
|
@@ -119,17 +119,25 @@ pub fn merge_ordered_arrays( | |
// Defines according to which ordering comparisons should be done. | ||
sort_options: &[SortOptions], | ||
) -> datafusion_common::Result<(Vec<ScalarValue>, Vec<Vec<ScalarValue>>)> { | ||
// Keep track the most recent data of each branch, in binary heap data structure. | ||
// Keep track of the most recent data of each branch, in a binary heap data structure. | ||
let mut heap = BinaryHeap::<CustomElement>::new(); | ||
|
||
if values.len() != ordering_values.len() | ||
|| values | ||
.iter() | ||
.zip(ordering_values.iter()) | ||
.any(|(vals, ordering_vals)| vals.len() != ordering_vals.len()) | ||
if values.len() != ordering_values.len() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't exactly feel strongly about these changes, but since these two conditions are different I liked the error message telling me which was the case. |
||
return exec_err!( | ||
"Expects values and ordering_values to have same size but got {} and {}", | ||
values.len(), | ||
ordering_values.len() | ||
); | ||
} | ||
if values | ||
.iter() | ||
.zip(ordering_values.iter()) | ||
.any(|(vals, ordering_vals)| vals.len() != ordering_vals.len()) | ||
{ | ||
return exec_err!( | ||
"Expects values arguments and/or ordering_values arguments to have same size" | ||
"Expects values elements and ordering_values elements to have same size but got {} and {}", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice fix; is it worth including index too? |
||
values.len(), | ||
ordering_values.len() | ||
); | ||
} | ||
let n_branch = values.len(); | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -687,13 +687,16 @@ impl Accumulator for OrderSensitiveArrayAggAccumulator { | |||||||||||||||||||
|
||||||||||||||||||||
// Convert array to Scalars to sort them easily. Convert back to array at evaluation. | ||||||||||||||||||||
let array_agg_res = ScalarValue::convert_array_to_scalar_vec(array_agg_values)?; | ||||||||||||||||||||
for v in array_agg_res.into_iter() { | ||||||||||||||||||||
partition_values.push(v.into()); | ||||||||||||||||||||
for maybe_v in array_agg_res.into_iter() { | ||||||||||||||||||||
if let Some(v) = maybe_v { | ||||||||||||||||||||
partition_values.push(v.into()); | ||||||||||||||||||||
} else { | ||||||||||||||||||||
partition_values.push(vec![].into()); | ||||||||||||||||||||
} | ||||||||||||||||||||
Comment on lines
+693
to
+695
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure about this one! With the other changes, the group_by slt tests fail without this change. It is because in datafusion/datafusion/functions-aggregate-common/src/merge_arrays.rs Lines 125 to 133 in 71512e6
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's also an exact same change done in |
||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
let orderings = ScalarValue::convert_array_to_scalar_vec(agg_orderings)?; | ||||||||||||||||||||
|
||||||||||||||||||||
for partition_ordering_rows in orderings.into_iter() { | ||||||||||||||||||||
for partition_ordering_rows in orderings.into_iter().flatten() { | ||||||||||||||||||||
// Extract value from struct to ordering_rows for each group/partition | ||||||||||||||||||||
let ordering_value = partition_ordering_rows.into_iter().map(|ordering_row| { | ||||||||||||||||||||
if let ScalarValue::Struct(s) = ordering_row { | ||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On main the middle element here would be an empty vec, not
None
.