diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index feffb11bf700..4a33334770a0 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -45,8 +45,7 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use test_utils::add_empty_batches; use datafusion::functions_window::row_number::row_number_udwf; -use datafusion_functions_window::dense_rank::dense_rank_udwf; -use datafusion_functions_window::rank::rank_udwf; +use datafusion_functions_window::rank::{dense_rank_udwf, rank_udwf}; use hashbrown::HashMap; use rand::distributions::Alphanumeric; use rand::rngs::StdRng; diff --git a/datafusion/functions-window/src/dense_rank.rs b/datafusion/functions-window/src/dense_rank.rs deleted file mode 100644 index c969a7c46fc6..000000000000 --- a/datafusion/functions-window/src/dense_rank.rs +++ /dev/null @@ -1,205 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! `dense_rank` window function implementation - -use std::any::Any; -use std::fmt::Debug; -use std::iter; -use std::ops::Range; -use std::sync::Arc; - -use crate::define_udwf_and_expr; -use crate::rank::RankState; -use datafusion_common::arrow::array::ArrayRef; -use datafusion_common::arrow::array::UInt64Array; -use datafusion_common::arrow::compute::SortOptions; -use datafusion_common::arrow::datatypes::DataType; -use datafusion_common::arrow::datatypes::Field; -use datafusion_common::utils::get_row_at_idx; -use datafusion_common::{Result, ScalarValue}; -use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; -use datafusion_functions_window_common::field; -use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; -use field::WindowUDFFieldArgs; - -define_udwf_and_expr!( - DenseRank, - dense_rank, - "Returns rank of the current row without gaps. This function counts peer groups" -); - -/// dense_rank expression -#[derive(Debug)] -pub struct DenseRank { - signature: Signature, -} - -impl DenseRank { - /// Create a new `dense_rank` function - pub fn new() -> Self { - Self { - signature: Signature::any(0, Volatility::Immutable), - } - } -} - -impl Default for DenseRank { - fn default() -> Self { - Self::new() - } -} - -impl WindowUDFImpl for DenseRank { - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - "dense_rank" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn partition_evaluator( - &self, - _partition_evaluator_args: PartitionEvaluatorArgs, - ) -> Result> { - Ok(Box::::default()) - } - - fn field(&self, field_args: WindowUDFFieldArgs) -> Result { - Ok(Field::new(field_args.name(), DataType::UInt64, false)) - } - - fn sort_options(&self) -> Option { - Some(SortOptions { - descending: false, - nulls_first: false, - }) - } -} - -/// State for the `dense_rank` built-in window function. -#[derive(Debug, Default)] -struct DenseRankEvaluator { - state: RankState, -} - -impl PartitionEvaluator for DenseRankEvaluator { - fn is_causal(&self) -> bool { - // The dense_rank function doesn't need "future" values to emit results: - true - } - - fn evaluate( - &mut self, - values: &[ArrayRef], - range: &Range, - ) -> Result { - let row_idx = range.start; - // There is no argument, values are order by column values (where rank is calculated) - let range_columns = values; - let last_rank_data = get_row_at_idx(range_columns, row_idx)?; - let new_rank_encountered = - if let Some(state_last_rank_data) = &self.state.last_rank_data { - // if rank data changes, new rank is encountered - state_last_rank_data != &last_rank_data - } else { - // First rank seen - true - }; - - if new_rank_encountered { - self.state.last_rank_data = Some(last_rank_data); - self.state.last_rank_boundary += self.state.current_group_count; - self.state.current_group_count = 1; - self.state.n_rank += 1; - } else { - // data is still in the same rank - self.state.current_group_count += 1; - } - - Ok(ScalarValue::UInt64(Some(self.state.n_rank as u64))) - } - - fn evaluate_all_with_rank( - &self, - _num_rows: usize, - ranks_in_partition: &[Range], - ) -> Result { - let result = Arc::new(UInt64Array::from_iter_values( - ranks_in_partition - .iter() - .zip(1u64..) - .flat_map(|(range, rank)| { - let len = range.end - range.start; - iter::repeat(rank).take(len) - }), - )); - - Ok(result) - } - - fn supports_bounded_execution(&self) -> bool { - true - } - - fn include_rank(&self) -> bool { - true - } -} - -#[cfg(test)] -mod tests { - use super::*; - use datafusion_common::cast::as_uint64_array; - - fn test_with_rank(expr: &DenseRank, expected: Vec) -> Result<()> { - test_i32_result(expr, vec![0..2, 2..3, 3..6, 6..7, 7..8], expected) - } - - #[allow(clippy::single_range_in_vec_init)] - fn test_without_rank(expr: &DenseRank, expected: Vec) -> Result<()> { - test_i32_result(expr, vec![0..8], expected) - } - - fn test_i32_result( - expr: &DenseRank, - ranks: Vec>, - expected: Vec, - ) -> Result<()> { - let args = PartitionEvaluatorArgs::default(); - let result = expr - .partition_evaluator(args)? - .evaluate_all_with_rank(8, &ranks)?; - let result = as_uint64_array(&result)?; - let result = result.values(); - assert_eq!(expected, *result); - Ok(()) - } - - #[test] - fn test_dense_rank() -> Result<()> { - let r = DenseRank::default(); - test_without_rank(&r, vec![1; 8])?; - test_with_rank(&r, vec![1, 1, 2, 3, 3, 3, 4, 5])?; - Ok(()) - } -} diff --git a/datafusion/functions-window/src/lib.rs b/datafusion/functions-window/src/lib.rs index b72780990841..ef624e13e61c 100644 --- a/datafusion/functions-window/src/lib.rs +++ b/datafusion/functions-window/src/lib.rs @@ -31,16 +31,12 @@ use datafusion_expr::WindowUDF; #[macro_use] pub mod macros; -pub mod dense_rank; -pub mod percent_rank; pub mod rank; pub mod row_number; /// Fluent-style API for creating `Expr`s pub mod expr_fn { - pub use super::dense_rank::dense_rank; - pub use super::percent_rank::percent_rank; - pub use super::rank::rank; + pub use super::rank::{dense_rank, percent_rank, rank}; pub use super::row_number::row_number; } @@ -49,8 +45,8 @@ pub fn all_default_window_functions() -> Vec> { vec![ row_number::row_number_udwf(), rank::rank_udwf(), - dense_rank::dense_rank_udwf(), - percent_rank::percent_rank_udwf(), + rank::dense_rank_udwf(), + rank::percent_rank_udwf(), ] } /// Registers all enabled packages with a [`FunctionRegistry`] diff --git a/datafusion/functions-window/src/macros.rs b/datafusion/functions-window/src/macros.rs index e934f883b101..2905ccf4c204 100644 --- a/datafusion/functions-window/src/macros.rs +++ b/datafusion/functions-window/src/macros.rs @@ -303,7 +303,7 @@ macro_rules! create_udwf_expr { ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr) => { paste::paste! { #[doc = " Create a [`WindowFunction`](datafusion_expr::Expr::WindowFunction) expression for"] - #[doc = concat!(" [`", stringify!($UDWF), "`] user-defined window function.")] + #[doc = concat!(" `", stringify!($UDWF), "` user-defined window function.")] #[doc = ""] #[doc = concat!(" ", $DOC)] pub fn $OUT_FN_NAME() -> datafusion_expr::Expr { @@ -316,7 +316,7 @@ macro_rules! create_udwf_expr { ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $DOC:expr) => { paste::paste! { #[doc = " Create a [`WindowFunction`](datafusion_expr::Expr::WindowFunction) expression for"] - #[doc = concat!(" [`", stringify!($UDWF), "`] user-defined window function.")] + #[doc = concat!(" `", stringify!($UDWF), "` user-defined window function.")] #[doc = ""] #[doc = concat!(" ", $DOC)] pub fn $OUT_FN_NAME( diff --git a/datafusion/functions-window/src/percent_rank.rs b/datafusion/functions-window/src/percent_rank.rs deleted file mode 100644 index 147959f69be9..000000000000 --- a/datafusion/functions-window/src/percent_rank.rs +++ /dev/null @@ -1,192 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! `percent_rank` window function implementation - -use std::any::Any; -use std::fmt::Debug; -use std::iter; -use std::ops::Range; -use std::sync::Arc; - -use crate::define_udwf_and_expr; -use datafusion_common::arrow::array::ArrayRef; -use datafusion_common::arrow::array::Float64Array; -use datafusion_common::arrow::compute::SortOptions; -use datafusion_common::arrow::datatypes::DataType; -use datafusion_common::arrow::datatypes::Field; -use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; -use datafusion_functions_window_common::field; -use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; -use field::WindowUDFFieldArgs; - -define_udwf_and_expr!( - PercentRank, - percent_rank, - "Returns the relative rank of the current row: (rank - 1) / (total rows - 1)" -); - -/// percent_rank expression -#[derive(Debug)] -pub struct PercentRank { - signature: Signature, -} - -impl PercentRank { - /// Create a new `percent_rank` function - pub fn new() -> Self { - Self { - signature: Signature::any(0, Volatility::Immutable), - } - } -} - -impl Default for PercentRank { - fn default() -> Self { - Self::new() - } -} - -impl WindowUDFImpl for PercentRank { - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - "percent_rank" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn partition_evaluator( - &self, - _partition_evaluator_args: PartitionEvaluatorArgs, - ) -> Result> { - Ok(Box::::default()) - } - - fn field(&self, field_args: WindowUDFFieldArgs) -> Result { - Ok(Field::new(field_args.name(), DataType::Float64, false)) - } - - fn sort_options(&self) -> Option { - Some(SortOptions { - descending: false, - nulls_first: false, - }) - } -} - -/// State for the `percent_rank` built-in window function. -#[derive(Debug, Default)] -struct PercentRankEvaluator {} - -impl PartitionEvaluator for PercentRankEvaluator { - fn is_causal(&self) -> bool { - // The percent_rank function doesn't need "future" values to emit results: - false - } - - fn evaluate( - &mut self, - _values: &[ArrayRef], - _range: &Range, - ) -> Result { - exec_err!("Can not execute PERCENT_RANK in a streaming fashion") - } - - fn evaluate_all_with_rank( - &self, - num_rows: usize, - ranks_in_partition: &[Range], - ) -> Result { - let denominator = num_rows as f64; - let result = - // Returns the relative rank of the current row, that is (rank - 1) / (total partition rows - 1). The value thus ranges from 0 to 1 inclusive. - Arc::new(Float64Array::from_iter_values( - ranks_in_partition - .iter() - .scan(0_u64, |acc, range| { - let len = range.end - range.start; - let value = (*acc as f64) / (denominator - 1.0).max(1.0); - let result = iter::repeat(value).take(len); - *acc += len as u64; - Some(result) - }) - .flatten(), - )); - - Ok(result) - } - - fn supports_bounded_execution(&self) -> bool { - false - } - - fn include_rank(&self) -> bool { - true - } -} - -#[cfg(test)] -mod tests { - use super::*; - use datafusion_common::cast::as_float64_array; - - fn test_f64_result( - expr: &PercentRank, - num_rows: usize, - ranks: Vec>, - expected: Vec, - ) -> Result<()> { - let args = PartitionEvaluatorArgs::default(); - let result = expr - .partition_evaluator(args)? - .evaluate_all_with_rank(num_rows, &ranks)?; - let result = as_float64_array(&result)?; - let result = result.values(); - assert_eq!(expected, *result); - Ok(()) - } - - #[test] - #[allow(clippy::single_range_in_vec_init)] - fn test_percent_rank() -> Result<()> { - let r = PercentRank::default(); - - // empty case - let expected = vec![0.0; 0]; - test_f64_result(&r, 0, vec![0..0; 0], expected)?; - - // singleton case - let expected = vec![0.0]; - test_f64_result(&r, 1, vec![0..1], expected)?; - - // uniform case - let expected = vec![0.0; 7]; - test_f64_result(&r, 7, vec![0..7], expected)?; - - // non-trivial case - let expected = vec![0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5]; - test_f64_result(&r, 7, vec![0..3, 3..7], expected)?; - - Ok(()) - } -} diff --git a/datafusion/functions-window/src/rank.rs b/datafusion/functions-window/src/rank.rs index c52dec9061ba..06c3f49055a5 100644 --- a/datafusion/functions-window/src/rank.rs +++ b/datafusion/functions-window/src/rank.rs @@ -15,23 +15,27 @@ // specific language governing permissions and limitations // under the License. -//! `rank` window function implementation +//! Implementation of `rank`, `dense_rank`, and `percent_rank` window functions, +//! which can be evaluated at runtime during query execution. use std::any::Any; use std::fmt::Debug; use std::iter; use std::ops::Range; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use crate::define_udwf_and_expr; use datafusion_common::arrow::array::ArrayRef; -use datafusion_common::arrow::array::UInt64Array; +use datafusion_common::arrow::array::{Float64Array, UInt64Array}; use datafusion_common::arrow::compute::SortOptions; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::utils::get_row_at_idx; -use datafusion_common::{Result, ScalarValue}; -use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING; +use datafusion_expr::{ + Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl, +}; use datafusion_functions_window_common::field; use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; use field::WindowUDFFieldArgs; @@ -39,28 +43,113 @@ use field::WindowUDFFieldArgs; define_udwf_and_expr!( Rank, rank, - "Returns rank of the current row with gaps. Same as `row_number` of its first peer" + "Returns rank of the current row with gaps. Same as `row_number` of its first peer", + Rank::basic ); -/// rank expression +define_udwf_and_expr!( + DenseRank, + dense_rank, + "Returns rank of the current row without gaps. This function counts peer groups", + Rank::dense_rank +); + +define_udwf_and_expr!( + PercentRank, + percent_rank, + "Returns the relative rank of the current row: (rank - 1) / (total rows - 1)", + Rank::percent_rank +); + +/// Rank calculates the rank in the window function with order by #[derive(Debug)] pub struct Rank { + name: String, signature: Signature, + rank_type: RankType, } impl Rank { - /// Create a new `rank` function - pub fn new() -> Self { + /// Create a new `rank` function with the specified name and rank type + pub fn new(name: String, rank_type: RankType) -> Self { Self { + name, signature: Signature::any(0, Volatility::Immutable), + rank_type, } } -} -impl Default for Rank { - fn default() -> Self { - Self::new() + /// Create a `rank` window function + pub fn basic() -> Self { + Rank::new("rank".to_string(), RankType::Basic) } + + /// Create a `dense_rank` window function + pub fn dense_rank() -> Self { + Rank::new("dense_rank".to_string(), RankType::Dense) + } + + /// Create a `percent_rank` window function + pub fn percent_rank() -> Self { + Rank::new("percent_rank".to_string(), RankType::Percent) + } +} + +#[derive(Debug, Copy, Clone)] +pub enum RankType { + Basic, + Dense, + Percent, +} + +static RANK_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_rank_doc() -> &'static Documentation { + RANK_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_RANKING) + .with_description( + "Returns the rank of the current row within its partition, allowing \ + gaps between ranks. This function provides a ranking similar to `row_number`, but \ + skips ranks for identical values.", + ) + .with_syntax_example("rank()") + .build() + .unwrap() + }) +} + +static DENSE_RANK_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_dense_rank_doc() -> &'static Documentation { + DENSE_RANK_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_RANKING) + .with_description( + "Returns the rank of the current row without gaps. This function ranks \ + rows in a dense manner, meaning consecutive ranks are assigned even for identical \ + values.", + ) + .with_syntax_example("dense_rank()") + .build() + .unwrap() + }) +} + +static PERCENT_RANK_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_percent_rank_doc() -> &'static Documentation { + PERCENT_RANK_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_RANKING) + .with_description( + "Returns the percentage rank of the current row within its partition. \ + The value ranges from 0 to 1 and is computed as `(rank - 1) / (total_rows - 1)`.", + ) + .with_syntax_example("percent_rank()") + .build() + .unwrap() + }) } impl WindowUDFImpl for Rank { @@ -69,7 +158,7 @@ impl WindowUDFImpl for Rank { } fn name(&self) -> &str { - "rank" + &self.name } fn signature(&self) -> &Signature { @@ -80,11 +169,20 @@ impl WindowUDFImpl for Rank { &self, _partition_evaluator_args: PartitionEvaluatorArgs, ) -> Result> { - Ok(Box::::default()) + Ok(Box::new(RankEvaluator { + state: RankState::default(), + rank_type: self.rank_type, + })) } fn field(&self, field_args: WindowUDFFieldArgs) -> Result { - Ok(Field::new(field_args.name(), DataType::UInt64, false)) + let return_type = match self.rank_type { + RankType::Basic | RankType::Dense => DataType::UInt64, + RankType::Percent => DataType::Float64, + }; + + let nullable = false; + Ok(Field::new(field_args.name(), return_type, nullable)) } fn sort_options(&self) -> Option { @@ -93,6 +191,14 @@ impl WindowUDFImpl for Rank { nulls_first: false, }) } + + fn documentation(&self) -> Option<&Documentation> { + match self.rank_type { + RankType::Basic => Some(get_rank_doc()), + RankType::Dense => Some(get_dense_rank_doc()), + RankType::Percent => Some(get_percent_rank_doc()), + } + } } /// State for the RANK(rank) built-in window function. @@ -109,15 +215,15 @@ pub struct RankState { } /// State for the `rank` built-in window function. -#[derive(Debug, Default)] +#[derive(Debug)] struct RankEvaluator { state: RankState, + rank_type: RankType, } impl PartitionEvaluator for RankEvaluator { fn is_causal(&self) -> bool { - // The rank function doesn't need "future" values to emit results: - true + matches!(self.rank_type, RankType::Basic | RankType::Dense) } fn evaluate( @@ -147,33 +253,68 @@ impl PartitionEvaluator for RankEvaluator { self.state.current_group_count += 1; } - Ok(ScalarValue::UInt64(Some( - self.state.last_rank_boundary as u64 + 1, - ))) + match self.rank_type { + RankType::Basic => Ok(ScalarValue::UInt64(Some( + self.state.last_rank_boundary as u64 + 1, + ))), + RankType::Dense => Ok(ScalarValue::UInt64(Some(self.state.n_rank as u64))), + RankType::Percent => { + exec_err!("Can not execute PERCENT_RANK in a streaming fashion") + } + } } fn evaluate_all_with_rank( &self, - _num_rows: usize, + num_rows: usize, ranks_in_partition: &[Range], ) -> Result { - let result = Arc::new(UInt64Array::from_iter_values( - ranks_in_partition - .iter() - .scan(1_u64, |acc, range| { - let len = range.end - range.start; - let result = iter::repeat(*acc).take(len); - *acc += len as u64; - Some(result) - }) - .flatten(), - )); + let result: ArrayRef = match self.rank_type { + RankType::Basic => Arc::new(UInt64Array::from_iter_values( + ranks_in_partition + .iter() + .scan(1_u64, |acc, range| { + let len = range.end - range.start; + let result = iter::repeat(*acc).take(len); + *acc += len as u64; + Some(result) + }) + .flatten(), + )), + + RankType::Dense => Arc::new(UInt64Array::from_iter_values( + ranks_in_partition + .iter() + .zip(1u64..) + .flat_map(|(range, rank)| { + let len = range.end - range.start; + iter::repeat(rank).take(len) + }), + )), + + RankType::Percent => { + let denominator = num_rows as f64; + + Arc::new(Float64Array::from_iter_values( + ranks_in_partition + .iter() + .scan(0_u64, |acc, range| { + let len = range.end - range.start; + let value = (*acc as f64) / (denominator - 1.0).max(1.0); + let result = iter::repeat(value).take(len); + *acc += len as u64; + Some(result) + }) + .flatten(), + )) + } + }; Ok(result) } fn supports_bounded_execution(&self) -> bool { - true + matches!(self.rank_type, RankType::Basic | RankType::Dense) } fn include_rank(&self) -> bool { @@ -184,7 +325,7 @@ impl PartitionEvaluator for RankEvaluator { #[cfg(test)] mod tests { use super::*; - use datafusion_common::cast::as_uint64_array; + use datafusion_common::cast::{as_float64_array, as_uint64_array}; fn test_with_rank(expr: &Rank, expected: Vec) -> Result<()> { test_i32_result(expr, vec![0..2, 2..3, 3..6, 6..7, 7..8], expected) @@ -210,11 +351,59 @@ mod tests { Ok(()) } + fn test_f64_result( + expr: &Rank, + num_rows: usize, + ranks: Vec>, + expected: Vec, + ) -> Result<()> { + let args = PartitionEvaluatorArgs::default(); + let result = expr + .partition_evaluator(args)? + .evaluate_all_with_rank(num_rows, &ranks)?; + let result = as_float64_array(&result)?; + let result = result.values(); + assert_eq!(expected, *result); + Ok(()) + } + #[test] fn test_rank() -> Result<()> { - let r = Rank::default(); + let r = Rank::basic(); test_without_rank(&r, vec![1; 8])?; test_with_rank(&r, vec![1, 1, 3, 4, 4, 4, 7, 8])?; Ok(()) } + + #[test] + fn test_dense_rank() -> Result<()> { + let r = Rank::dense_rank(); + test_without_rank(&r, vec![1; 8])?; + test_with_rank(&r, vec![1, 1, 2, 3, 3, 3, 4, 5])?; + Ok(()) + } + + #[test] + #[allow(clippy::single_range_in_vec_init)] + fn test_percent_rank() -> Result<()> { + let r = Rank::percent_rank(); + + // empty case + let expected = vec![0.0; 0]; + test_f64_result(&r, 0, vec![0..0; 0], expected)?; + + // singleton case + let expected = vec![0.0]; + test_f64_result(&r, 1, vec![0..1], expected)?; + + // uniform case + let expected = vec![0.0; 7]; + test_f64_result(&r, 7, vec![0..7], expected)?; + + // non-trivial case + let expected = vec![0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5]; + test_f64_result(&r, 7, vec![0..3, 3..7], expected)?; + + Ok(()) + } } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 8c9b368598de..ffa8fc1eefe9 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -47,9 +47,7 @@ use datafusion::functions_aggregate::expr_fn::{ }; use datafusion::functions_aggregate::min_max::max_udaf; use datafusion::functions_nested::map::map; -use datafusion::functions_window::dense_rank::dense_rank; -use datafusion::functions_window::percent_rank::percent_rank; -use datafusion::functions_window::rank::{rank, rank_udwf}; +use datafusion::functions_window::rank::{dense_rank, percent_rank, rank, rank_udwf}; use datafusion::functions_window::row_number::row_number; use datafusion::prelude::*; use datafusion::test_util::{TestTableFactory, TestTableProvider}; diff --git a/docs/source/user-guide/sql/window_functions_new.md b/docs/source/user-guide/sql/window_functions_new.md index ee981911f5cb..462fc900d139 100644 --- a/docs/source/user-guide/sql/window_functions_new.md +++ b/docs/source/user-guide/sql/window_functions_new.md @@ -157,8 +157,35 @@ All [aggregate functions](aggregate_functions.md) can be used as window function ## Ranking Functions +- [dense_rank](#dense_rank) +- [percent_rank](#percent_rank) +- [rank](#rank) - [row_number](#row_number) +### `dense_rank` + +Returns the rank of the current row without gaps. This function ranks rows in a dense manner, meaning consecutive ranks are assigned even for identical values. + +``` +dense_rank() +``` + +### `percent_rank` + +Returns the percentage rank of the current row within its partition. The value ranges from 0 to 1 and is computed as `(rank - 1) / (total_rows - 1)`. + +``` +percent_rank() +``` + +### `rank` + +Returns the rank of the current row within its partition, allowing gaps between ranks. This function provides a ranking similar to `row_number`, but skips ranks for identical values. + +``` +rank() +``` + ### `row_number` Number of the current row within its partition, counting from 1.