-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Support number of centroids in approx_percentile_cont #3146
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,6 +40,7 @@ pub struct ApproxPercentileCont { | |
| input_data_type: DataType, | ||
| expr: Vec<Arc<dyn PhysicalExpr>>, | ||
| percentile: f64, | ||
| tdigest_max_size: Option<usize>, | ||
| } | ||
|
|
||
| impl ApproxPercentileCont { | ||
|
|
@@ -52,39 +53,35 @@ impl ApproxPercentileCont { | |
| // Arguments should be [ColumnExpr, DesiredPercentileLiteral] | ||
| debug_assert_eq!(expr.len(), 2); | ||
|
|
||
| // Extract the desired percentile literal | ||
| let lit = expr[1] | ||
| .as_any() | ||
| .downcast_ref::<Literal>() | ||
| .ok_or_else(|| { | ||
| DataFusionError::Internal( | ||
| "desired percentile argument must be float literal".to_string(), | ||
| ) | ||
| })? | ||
| .value(); | ||
| let percentile = match lit { | ||
| ScalarValue::Float32(Some(q)) => *q as f64, | ||
| ScalarValue::Float64(Some(q)) => *q as f64, | ||
| got => return Err(DataFusionError::NotImplemented(format!( | ||
| "Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32 or Float64 literal (got data type {})", | ||
| got | ||
| ))) | ||
| }; | ||
| let percentile = validate_input_percentile_expr(&expr[1])?; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
|
|
||
| // Ensure the percentile is between 0 and 1. | ||
| if !(0.0..=1.0).contains(&percentile) { | ||
| return Err(DataFusionError::Plan(format!( | ||
| "Percentile value must be between 0.0 and 1.0 inclusive, {} is invalid", | ||
| percentile | ||
| ))); | ||
| } | ||
| Ok(Self { | ||
| name: name.into(), | ||
| input_data_type, | ||
| // The physical expr to evaluate during accumulation | ||
| expr, | ||
| percentile, | ||
| tdigest_max_size: None, | ||
| }) | ||
| } | ||
|
|
||
| /// Create a new [`ApproxPercentileCont`] aggregate function. | ||
| pub fn new_with_max_size( | ||
| expr: Vec<Arc<dyn PhysicalExpr>>, | ||
| name: impl Into<String>, | ||
| input_data_type: DataType, | ||
| ) -> Result<Self> { | ||
| // Arguments should be [ColumnExpr, DesiredPercentileLiteral, TDigestMaxSize] | ||
| debug_assert_eq!(expr.len(), 3); | ||
| let percentile = validate_input_percentile_expr(&expr[1])?; | ||
| let max_size = validate_input_max_size_expr(&expr[2])?; | ||
| Ok(Self { | ||
| name: name.into(), | ||
| input_data_type, | ||
| // The physical expr to evaluate during accumulation | ||
| expr, | ||
| percentile, | ||
| tdigest_max_size: Some(max_size), | ||
| }) | ||
| } | ||
|
|
||
|
|
@@ -100,7 +97,13 @@ impl ApproxPercentileCont { | |
| | DataType::Int64 | ||
| | DataType::Float32 | ||
| | DataType::Float64) => { | ||
| ApproxPercentileAccumulator::new(self.percentile, t.clone()) | ||
| if let Some(max_size) = self.tdigest_max_size { | ||
| ApproxPercentileAccumulator::new_with_max_size(self.percentile, t.clone(), max_size) | ||
|
|
||
| }else{ | ||
| ApproxPercentileAccumulator::new(self.percentile, t.clone()) | ||
|
|
||
| } | ||
| } | ||
| other => { | ||
| return Err(DataFusionError::NotImplemented(format!( | ||
|
|
@@ -113,6 +116,64 @@ impl ApproxPercentileCont { | |
| } | ||
| } | ||
|
|
||
| fn validate_input_percentile_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<f64> { | ||
| // Extract the desired percentile literal | ||
| let lit = expr | ||
| .as_any() | ||
| .downcast_ref::<Literal>() | ||
| .ok_or_else(|| { | ||
| DataFusionError::Internal( | ||
| "desired percentile argument must be float literal".to_string(), | ||
| ) | ||
| })? | ||
| .value(); | ||
| let percentile = match lit { | ||
| ScalarValue::Float32(Some(q)) => *q as f64, | ||
| ScalarValue::Float64(Some(q)) => *q as f64, | ||
| got => return Err(DataFusionError::NotImplemented(format!( | ||
| "Percentile value for 'APPROX_PERCENTILE_CONT' must be Float32 or Float64 literal (got data type {})", | ||
| got.get_datatype() | ||
| ))) | ||
| }; | ||
|
|
||
| // Ensure the percentile is between 0 and 1. | ||
| if !(0.0..=1.0).contains(&percentile) { | ||
| return Err(DataFusionError::Plan(format!( | ||
| "Percentile value must be between 0.0 and 1.0 inclusive, {} is invalid", | ||
| percentile | ||
| ))); | ||
| } | ||
| Ok(percentile) | ||
| } | ||
|
|
||
| fn validate_input_max_size_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<usize> { | ||
| // Extract the desired percentile literal | ||
| let lit = expr | ||
| .as_any() | ||
| .downcast_ref::<Literal>() | ||
| .ok_or_else(|| { | ||
| DataFusionError::Internal( | ||
| "desired percentile argument must be float literal".to_string(), | ||
| ) | ||
| })? | ||
| .value(); | ||
| let max_size = match lit { | ||
| ScalarValue::UInt8(Some(q)) => *q as usize, | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For now i think we can not input unsigned int from sql 🤔
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I think we should file a ticket to allow creating unsigned int types from SQL (like in I ran into this limitation while I was fixing #3167 -- https://github.com/apache/arrow-datafusion/pull/3167/files#r946145318 |
||
| ScalarValue::UInt16(Some(q)) => *q as usize, | ||
| ScalarValue::UInt32(Some(q)) => *q as usize, | ||
| ScalarValue::UInt64(Some(q)) => *q as usize, | ||
| ScalarValue::Int32(Some(q)) if *q > 0 => *q as usize, | ||
| ScalarValue::Int64(Some(q)) if *q > 0 => *q as usize, | ||
| ScalarValue::Int16(Some(q)) if *q > 0 => *q as usize, | ||
| ScalarValue::Int8(Some(q)) if *q > 0 => *q as usize, | ||
| got => return Err(DataFusionError::NotImplemented(format!( | ||
| "Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal (got data type {}).", | ||
| got.get_datatype() | ||
| ))) | ||
| }; | ||
| Ok(max_size) | ||
| } | ||
|
|
||
| impl AggregateExpr for ApproxPercentileCont { | ||
| fn as_any(&self) -> &dyn Any { | ||
| self | ||
|
|
@@ -190,6 +251,18 @@ impl ApproxPercentileAccumulator { | |
| } | ||
| } | ||
|
|
||
| pub fn new_with_max_size( | ||
| percentile: f64, | ||
| return_type: DataType, | ||
| max_size: usize, | ||
| ) -> Self { | ||
| Self { | ||
| digest: TDigest::new(max_size), | ||
| percentile, | ||
| return_type, | ||
| } | ||
| } | ||
|
|
||
| pub(crate) fn merge_digests(&mut self, digests: &[TDigest]) { | ||
| self.digest = TDigest::merge_digests(digests); | ||
| } | ||
|
|
@@ -285,7 +358,6 @@ impl ApproxPercentileAccumulator { | |
| } | ||
| } | ||
| } | ||
|
|
||
| impl Accumulator for ApproxPercentileAccumulator { | ||
| fn state(&self) -> Result<Vec<AggregateState>> { | ||
| Ok(self | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not a statistics expert but I verified that this is different than the output generated by
SELECT c1, approx_percentile_cont_with_weight(c3, c2, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1a few lines aboveThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alamb Thanks for your review 😊
this sql should have with the same result as
SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1in line 867