From e8a94e18da40a5df127427b78f2dd8d3b0586767 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Wed, 8 Oct 2025 16:05:59 +0300
Subject: [PATCH 1/3] fix: actually generate a lot of unique values in
 benchmark table

Also added a benchmark for testing pure grouping performance on more than one column.

----

I ran this query on the data:
```
SELECT
    COUNT(*) AS total_count,
    COUNT(DISTINCT u64_wide) AS unique_count,
    COUNT(DISTINCT u64_wide) * 1.0 / COUNT(*) AS cardinality
FROM t;
```

Before:
```
| total_count | unique_count | cardinality |
| ----------- | ------------ | ----------- |
|    65536    |    2048      |   0.03125   |
```

After:
```
| total_count | unique_count | cardinality |
| ----------- | ------------ | ----------- |
|    65536    |    65536     |     1.0     |
```
---
 datafusion/core/benches/aggregate_query_sql.rs | 13 +++++++++++++
 datafusion/core/benches/data_utils/mod.rs      |  7 ++-----
 2 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs
index 057a0e1d1b54..a090ece4d6c3 100644
--- a/datafusion/core/benches/aggregate_query_sql.rs
+++ b/datafusion/core/benches/aggregate_query_sql.rs
@@ -153,6 +153,19 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
+    c.bench_function("aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions", |b| {
+        b.iter(|| {
+            query(
+                ctx.clone(),
+                &rt,
+                // Due to the large number of distinct values in u64_wide,
+                // this query test the actual grouping performance for more than 1 column
+                "SELECT u64_wide, utf8 \
+                 FROM t GROUP BY u64_wide, utf8",
+            )
+        })
+    });
+
     c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| {
         b.iter(|| {
             query(
diff --git a/datafusion/core/benches/data_utils/mod.rs b/datafusion/core/benches/data_utils/mod.rs
index c0477b1306f7..a97a694f4a95 100644
--- a/datafusion/core/benches/data_utils/mod.rs
+++ b/datafusion/core/benches/data_utils/mod.rs
@@ -81,10 +81,7 @@ fn create_data(size: usize, null_density: f64) -> Vec<Option<f64>> {
         .collect()
 }
 
-fn create_integer_data(size: usize, value_density: f64) -> Vec<Option<u64>> {
-    // use random numbers to avoid spurious compiler optimizations wrt to branching
-    let mut rng = StdRng::seed_from_u64(42);
-
+fn create_integer_data(rng: &mut StdRng, size: usize, value_density: f64) -> Vec<Option<u64>> {
     (0..size)
         .map(|_| {
             if rng.random::<f64>() > value_density {
@@ -116,7 +113,7 @@ fn create_record_batch(
     let values = create_data(batch_size, 0.5);
 
     // Integer values between [0, u64::MAX].
-    let integer_values_wide = create_integer_data(batch_size, 9.0);
+    let integer_values_wide = create_integer_data(rng, batch_size, 9.0);
 
     // Integer values between [0, 9].
     let integer_values_narrow = (0..batch_size)

From 155868df9ad86f0c555ea0a812aff1ab76848547 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Wed, 8 Oct 2025 16:10:47 +0300
Subject: [PATCH 2/3] format

---
 .../core/benches/aggregate_query_sql.rs       | 25 +++++++++++--------
 datafusion/core/benches/data_utils/mod.rs     |  6 ++++-
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs
index a090ece4d6c3..26355fcadf56 100644
--- a/datafusion/core/benches/aggregate_query_sql.rs
+++ b/datafusion/core/benches/aggregate_query_sql.rs
@@ -153,18 +153,21 @@ fn criterion_benchmark(c: &mut Criterion) {
         })
     });
 
-    c.bench_function("aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions", |b| {
-        b.iter(|| {
-            query(
-                ctx.clone(),
-                &rt,
-                // Due to the large number of distinct values in u64_wide,
-                // this query test the actual grouping performance for more than 1 column
-                "SELECT u64_wide, utf8 \
+    c.bench_function(
+        "aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions",
+        |b| {
+            b.iter(|| {
+                query(
+                    ctx.clone(),
+                    &rt,
+                    // Due to the large number of distinct values in u64_wide,
+                    // this query test the actual grouping performance for more than 1 column
+                    "SELECT u64_wide, utf8 \
                  FROM t GROUP BY u64_wide, utf8",
-            )
-        })
-    });
+                )
+            })
+        },
+    );
 
     c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| {
         b.iter(|| {
diff --git a/datafusion/core/benches/data_utils/mod.rs b/datafusion/core/benches/data_utils/mod.rs
index a97a694f4a95..fffe2e2d1752 100644
--- a/datafusion/core/benches/data_utils/mod.rs
+++ b/datafusion/core/benches/data_utils/mod.rs
@@ -81,7 +81,11 @@ fn create_data(size: usize, null_density: f64) -> Vec<Option<f64>> {
         .collect()
 }
 
-fn create_integer_data(rng: &mut StdRng, size: usize, value_density: f64) -> Vec<Option<u64>> {
+fn create_integer_data(
+    rng: &mut StdRng,
+    size: usize,
+    value_density: f64,
+) -> Vec<Option<u64>> {
     (0..size)
         .map(|_| {
             if rng.random::<f64>() > value_density {

From f4d0373a9a4559f9eea9f515a5796efea7b4caa1 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Wed, 8 Oct 2025 16:27:29 +0300
Subject: [PATCH 3/3] added multi-column group-by benchmark on primitive-only columns

---
 datafusion/core/benches/aggregate_query_sql.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/datafusion/core/benches/aggregate_query_sql.rs b/datafusion/core/benches/aggregate_query_sql.rs
index 26355fcadf56..9da341ce2e92 100644
--- a/datafusion/core/benches/aggregate_query_sql.rs
+++ b/datafusion/core/benches/aggregate_query_sql.rs
@@ -169,6 +169,22 @@ fn criterion_benchmark(c: &mut Criterion) {
         },
     );
 
+    c.bench_function(
+        "aggregate_query_group_by_wide_u64_and_f32_without_aggregate_expressions",
+        |b| {
+            b.iter(|| {
+                query(
+                    ctx.clone(),
+                    &rt,
+                    // Due to the large number of distinct values in u64_wide,
+                    // this query test the actual grouping performance for more than 1 column
+                    "SELECT u64_wide, f32 \
+                 FROM t GROUP BY u64_wide, f32",
+                )
+            })
+        },
+    );
+
     c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| {
         b.iter(|| {
             query(