-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Optimize date_trunc function by avoiding allocations
#18360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,8 +28,9 @@ use arrow::array::types::{ | |
| ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType, | ||
| TimestampNanosecondType, TimestampSecondType, | ||
| }; | ||
| use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray}; | ||
| use arrow::array::{Array, ArrayRef, AsArray, PrimitiveArray}; | ||
| use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View}; | ||
| use arrow::datatypes::Int64Type; | ||
| use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second}; | ||
| use datafusion_common::cast::as_primitive_array; | ||
| use datafusion_common::{ | ||
|
|
@@ -455,36 +456,57 @@ fn general_date_trunc_array_fine_granularity<T: ArrowTimestampType>( | |
| array: &PrimitiveArray<T>, | ||
| granularity: &str, | ||
| ) -> Result<ArrayRef> { | ||
| let unit = match (tu, granularity) { | ||
| (Second, "minute") => Some(Int64Array::new_scalar(60)), | ||
| (Second, "hour") => Some(Int64Array::new_scalar(3600)), | ||
| (Second, "day") => Some(Int64Array::new_scalar(86400)), | ||
|
|
||
| (Millisecond, "second") => Some(Int64Array::new_scalar(1_000)), | ||
| (Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)), | ||
| (Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)), | ||
| (Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)), | ||
|
|
||
| (Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)), | ||
| (Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)), | ||
| (Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)), | ||
| (Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)), | ||
| (Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)), | ||
|
|
||
| (Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)), | ||
| (Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)), | ||
| (Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)), | ||
| (Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)), | ||
| (Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)), | ||
| (Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)), | ||
| let unit: Option<i64> = match (tu, granularity) { | ||
| (Second, "minute") => Some(60), | ||
| (Second, "hour") => Some(3600), | ||
| (Second, "day") => Some(86400), | ||
|
|
||
| (Millisecond, "second") => Some(1_000), | ||
| (Millisecond, "minute") => Some(60_000), | ||
| (Millisecond, "hour") => Some(3_600_000), | ||
| (Millisecond, "day") => Some(86_400_000), | ||
|
|
||
| (Microsecond, "millisecond") => Some(1_000), | ||
| (Microsecond, "second") => Some(1_000_000), | ||
| (Microsecond, "minute") => Some(60_000_000), | ||
| (Microsecond, "hour") => Some(3_600_000_000), | ||
| (Microsecond, "day") => Some(86_400_000_000), | ||
|
|
||
| (Nanosecond, "microsecond") => Some(1_000), | ||
| (Nanosecond, "millisecond") => Some(1_000_000), | ||
| (Nanosecond, "second") => Some(1_000_000_000), | ||
| (Nanosecond, "minute") => Some(60_000_000_000), | ||
| (Nanosecond, "hour") => Some(3_600_000_000_000), | ||
| (Nanosecond, "day") => Some(86_400_000_000_000), | ||
| _ => None, | ||
| }; | ||
|
|
||
| if let Some(unit) = unit { | ||
| let original_type = array.data_type(); | ||
| let array = arrow::compute::cast(array, &DataType::Int64)?; | ||
| let array = arrow::compute::kernels::numeric::div(&array, &unit)?; | ||
| let array = arrow::compute::kernels::numeric::mul(&array, &unit)?; | ||
| let input = arrow::compute::cast(array, &DataType::Int64)?; | ||
| // Optimize performance by doing operations in place if possible | ||
| let array = input.as_primitive::<Int64Type>().clone(); | ||
| drop(input); // ensure the input reference is dropped (so we can reuse the memory if possible) | ||
| let array = try_unary_mut_or_clone(array, |i| { | ||
| i.checked_div(unit) | ||
| .ok_or_else(|| exec_datafusion_err!("division overflow")) | ||
| })?; | ||
| let array = try_unary_mut_or_clone(array, |i| { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. technically speaking, only the first
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is true, though I don't know how to represent this in code. Maybe I could make a second function |
||
| i.checked_mul(unit) | ||
| .ok_or_else(|| exec_datafusion_err!("multiplication overflow")) | ||
| })?; | ||
| let array = try_unary_mut_or_clone(array, |i| { | ||
| // For timestamps before 1970-01-01T00:00:00Z (negative values) | ||
| // it is possible that the truncated value is actually later | ||
| // than the original value. Correct any such cases by | ||
| // subtracting `unit`. | ||
| if i > 0 { | ||
| Ok(i) | ||
| } else { | ||
| i.checked_sub(unit) | ||
| .ok_or_else(|| exec_datafusion_err!("subtraction overflow")) | ||
| } | ||
| })?; | ||
| let array = arrow::compute::cast(&array, original_type)?; | ||
| Ok(array) | ||
| } else { | ||
|
|
@@ -493,6 +515,21 @@ fn general_date_trunc_array_fine_granularity<T: ArrowTimestampType>( | |
| } | ||
| } | ||
|
|
||
| /// Applies the unary operation in place if possible, or cloning the array if not | ||
| fn try_unary_mut_or_clone<F>( | ||
| array: PrimitiveArray<Int64Type>, | ||
| op: F, | ||
| ) -> Result<PrimitiveArray<Int64Type>> | ||
| where | ||
| F: Fn(i64) -> Result<i64>, | ||
|
Comment on lines
+519
to
+524
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not really date_trunc specific. can this be made more flexible with a more generous use of generics?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, I agree -- the try_unary_mut is quite awkward to use. I will see if I can port some of these changes upstream / see what they look like
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| { | ||
| match array.try_unary_mut(&op) { | ||
| Ok(result) => result, | ||
| // on error, make a new array | ||
| Err(array) => array.try_unary(op), | ||
| } | ||
| } | ||
|
|
||
| // truncates a single value with the given timeunit to the specified granularity | ||
| fn general_date_trunc( | ||
| tu: TimeUnit, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The key idea here is to make this code faster by reusing the allocation and operating in place rather than allocating new arrays