From e8a94e18da40a5df127427b78f2dd8d3b0586767 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Wed, 8 Oct 2025 16:05:59 +0300
Subject: [PATCH 1/3] fix: actually generate a lot of unique values in
benchmark table
Also added a benchmark for testing pure grouping performance on more than one column.
----
I ran this query against the data:
```
SELECT
COUNT(*) AS total_count,
COUNT(DISTINCT u64_wide) AS unique_count,
COUNT(DISTINCT u64_wide) * 1.0 / COUNT(*) AS cardinality
FROM t;
```
Before:
```
| total_count | unique_count | cardinality |
| ----------- | ------------ | ----------- |
| 65536 | 2048 | 0.03125 |
```
After:
```
| total_count | unique_count | cardinality |
| ----------- | ------------ | ----------- |
| 65536 | 65536 | 1.0 |
```
---
datafusion/core/benches/aggregate_query_sql.rs | 13 +++++++++++++
datafusion/core/benches/data_utils/mod.rs | 7 ++-----
2 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs
index 057a0e1d1b54..a090ece4d6c3 100644
--- a/datafusion/core/benches/aggregate_query_sql.rs
+++ b/datafusion/core/benches/aggregate_query_sql.rs
@@ -153,6 +153,19 @@ fn criterion_benchmark(c: &mut Criterion) {
})
});
+ c.bench_function("aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions", |b| {
+ b.iter(|| {
+ query(
+ ctx.clone(),
+ &rt,
+ // Due to the large number of distinct values in u64_wide,
+ // this query tests the actual grouping performance for more than 1 column
+ "SELECT u64_wide, utf8 \
+ FROM t GROUP BY u64_wide, utf8",
+ )
+ })
+ });
+
c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| {
b.iter(|| {
query(
diff --git a/datafusion/core/benches/data_utils/mod.rs b/datafusion/core/benches/data_utils/mod.rs
index c0477b1306f7..a97a694f4a95 100644
--- a/datafusion/core/benches/data_utils/mod.rs
+++ b/datafusion/core/benches/data_utils/mod.rs
@@ -81,10 +81,7 @@ fn create_data(size: usize, null_density: f64) -> Vec