From e8a94e18da40a5df127427b78f2dd8d3b0586767 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 8 Oct 2025 16:05:59 +0300 Subject: [PATCH 1/3] fix: actually generate a lot of unique values in benchmark table also added benchmark for testing pure grouping performance for more than 1 column. ---- I ran this query on the data: ``` SELECT COUNT(*) AS total_count, COUNT(DISTINCT u64_wide) AS unique_count, COUNT(DISTINCT u64_wide) * 1.0 / COUNT(*) AS cardinality FROM t; ``` Before: ``` | total_count | unique_count | cardinality | | ----------- | ------------ | ----------- | | 65536 | 2048 | 0.03125 | ``` After: ``` | total_count | unique_count | cardinality | | ----------- | ------------ | ----------- | | 65536 | 65536 | 1.0 | ``` --- datafusion/core/benches/aggregate_query_sql.rs | 13 +++++++++++++ datafusion/core/benches/data_utils/mod.rs | 7 ++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index 057a0e1d1b54..a090ece4d6c3 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -153,6 +153,19 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); + c.bench_function("aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions", |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + // Due to the large number of distinct values in u64_wide, + // this query test the actual grouping performance for more than 1 column + "SELECT u64_wide, utf8 \ + FROM t GROUP BY u64_wide, utf8", + ) + }) + }); + c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| { b.iter(|| { query( diff --git a/datafusion/core/benches/data_utils/mod.rs b/datafusion/core/benches/data_utils/mod.rs index c0477b1306f7..a97a694f4a95 100644 --- a/datafusion/core/benches/data_utils/mod.rs +++ b/datafusion/core/benches/data_utils/mod.rs @@ -81,10 +81,7 @@ fn 
create_data(size: usize, null_density: f64) -> Vec> { .collect() } -fn create_integer_data(size: usize, value_density: f64) -> Vec> { - // use random numbers to avoid spurious compiler optimizations wrt to branching - let mut rng = StdRng::seed_from_u64(42); - +fn create_integer_data(rng: &mut StdRng, size: usize, value_density: f64) -> Vec> { (0..size) .map(|_| { if rng.random::() > value_density { @@ -116,7 +113,7 @@ fn create_record_batch( let values = create_data(batch_size, 0.5); // Integer values between [0, u64::MAX]. - let integer_values_wide = create_integer_data(batch_size, 9.0); + let integer_values_wide = create_integer_data(rng, batch_size, 9.0); // Integer values between [0, 9]. let integer_values_narrow = (0..batch_size) From 155868df9ad86f0c555ea0a812aff1ab76848547 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 8 Oct 2025 16:10:47 +0300 Subject: [PATCH 2/3] format --- .../core/benches/aggregate_query_sql.rs | 25 +++++++++++-------- datafusion/core/benches/data_utils/mod.rs | 6 ++++- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index a090ece4d6c3..26355fcadf56 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -153,18 +153,21 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions", |b| { - b.iter(|| { - query( - ctx.clone(), - &rt, - // Due to the large number of distinct values in u64_wide, - // this query test the actual grouping performance for more than 1 column - "SELECT u64_wide, utf8 \ + c.bench_function( + "aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions", + |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + // Due to the large number of distinct values in u64_wide, + // this query test the 
actual grouping performance for more than 1 column + "SELECT u64_wide, utf8 \ FROM t GROUP BY u64_wide, utf8", - ) - }) - }); + ) + }) + }, + ); c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| { b.iter(|| { diff --git a/datafusion/core/benches/data_utils/mod.rs b/datafusion/core/benches/data_utils/mod.rs index a97a694f4a95..fffe2e2d1752 100644 --- a/datafusion/core/benches/data_utils/mod.rs +++ b/datafusion/core/benches/data_utils/mod.rs @@ -81,7 +81,11 @@ fn create_data(size: usize, null_density: f64) -> Vec> { .collect() } -fn create_integer_data(rng: &mut StdRng, size: usize, value_density: f64) -> Vec> { +fn create_integer_data( + rng: &mut StdRng, + size: usize, + value_density: f64, +) -> Vec> { (0..size) .map(|_| { if rng.random::() > value_density { From f4d0373a9a4559f9eea9f515a5796efea7b4caa1 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 8 Oct 2025 16:27:29 +0300 Subject: [PATCH 3/3] added multi group by benchmark on primitive only columns --- datafusion/core/benches/aggregate_query_sql.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs index 26355fcadf56..9da341ce2e92 100644 --- a/datafusion/core/benches/aggregate_query_sql.rs +++ b/datafusion/core/benches/aggregate_query_sql.rs @@ -169,6 +169,22 @@ fn criterion_benchmark(c: &mut Criterion) { }, ); + c.bench_function( + "aggregate_query_group_by_wide_u64_and_f32_without_aggregate_expressions", + |b| { + b.iter(|| { + query( + ctx.clone(), + &rt, + // Due to the large number of distinct values in u64_wide, + // this query test the actual grouping performance for more than 1 column + "SELECT u64_wide, f32 \ + FROM t GROUP BY u64_wide, f32", + ) + }) + }, + ); + c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| { b.iter(|| { query(