Skip to content
4 changes: 4 additions & 0 deletions datafusion/functions-nested/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ rand = { workspace = true }
harness = false
name = "array_expression"

[[bench]]
harness = false
name = "array_has"

[[bench]]
harness = false
name = "array_reverse"
Expand Down
377 changes: 377 additions & 0 deletions datafusion/functions-nested/benches/array_has.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,377 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#[macro_use]
extern crate criterion;

use criterion::{BenchmarkId, Criterion};
use datafusion_expr::lit;
use datafusion_functions_nested::expr_fn::{
array_has, array_has_all, array_has_any, make_array,
};
use std::hint::black_box;

// If not explicitly stated, `array` and `array_size` refer to the haystack array.
fn criterion_benchmark(c: &mut Criterion) {
// Test different array sizes
let array_sizes = vec![1, 10, 100, 1000, 10000];

for &size in &array_sizes {
bench_array_has(c, size);
bench_array_has_all(c, size);
bench_array_has_any(c, size);
}

// Specific benchmarks for string arrays (common use case)
bench_array_has_strings(c);
bench_array_has_all_strings(c);
bench_array_has_any_strings(c);

// Edge cases
bench_array_has_edge_cases(c);
}

fn bench_array_has(c: &mut Criterion, array_size: usize) {
let mut group = c.benchmark_group("array_has_i64");

// Benchmark: element found at beginning
group.bench_with_input(
BenchmarkId::new("found_at_start", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This generates sequential data which is not very realistic. Consider generating skewed and/or non-sequential uniform data.

let list_array = make_array(array);
let needle = lit(0_i64);

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here you test the time for building the expression, not the time to execute it!
See how array_reverse bench uses array_reverse_inner() to execute it - https://github.com/zhuqi-lucas/arrow-datafusion/blob/e7a723b06fb8b164033514ac73cddaf1b512eac7/datafusion/functions-nested/benches/array_reverse.rs#L32-L34
or the map bench uses invoke_with_args() at https://github.com/zhuqi-lucas/arrow-datafusion/blob/e7a723b06fb8b164033514ac73cddaf1b512eac7/datafusion/functions-nested/benches/map.rs#L114

},
);

// Benchmark: element found at end
group.bench_with_input(
BenchmarkId::new("found_at_end", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle = lit((size - 1) as i64);

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For small arrays the cloning operation may dominate the time. Can you use a reference instead ?

},
);

// Benchmark: element not found
group.bench_with_input(
BenchmarkId::new("not_found", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle = lit(-1_i64); // Not in array

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
},
);

group.finish();
}

fn bench_array_has_all(c: &mut Criterion, array_size: usize) {
let mut group = c.benchmark_group("array_has_all");

// Benchmark: all elements found (small needle)
group.bench_with_input(
BenchmarkId::new("all_found_small_needle", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_array = make_array(vec![lit(0_i64), lit(1_i64), lit(2_i64)]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
},
);

// Benchmark: all elements found (medium needle - 10% of haystack)
group.bench_with_input(
BenchmarkId::new("all_found_medium_needle", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_size = (size / 10).max(1);
let needle = (0..needle_size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let needle_array = make_array(needle);

b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
},
);

// Benchmark: not all found (early exit)
group.bench_with_input(
BenchmarkId::new("early_exit", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_array = make_array(vec![lit(0_i64), lit(-1_i64)]); // -1 not in array

b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
},
);

group.finish();
}

fn bench_array_has_any(c: &mut Criterion, array_size: usize) {
let mut group = c.benchmark_group("array_has_any");

// Benchmark: first element matches (best case)
group.bench_with_input(
BenchmarkId::new("first_match", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_array = make_array(vec![lit(0_i64), lit(-1_i64), lit(-2_i64)]);

b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
},
);

// Benchmark: last element matches (worst case)
group.bench_with_input(
BenchmarkId::new("last_match", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(0_i64)]);

b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
},
);

// Benchmark: no match
group.bench_with_input(
BenchmarkId::new("no_match", array_size),
&array_size,
|b, &size| {
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(-3_i64)]);

b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
},
);

group.finish();
}

fn bench_array_has_strings(c: &mut Criterion) {
let mut group = c.benchmark_group("array_has_strings");

// Benchmark with string arrays (common use case for tickers, tags, etc.)
let sizes = vec![10, 100, 1000];

for &size in &sizes {
group.bench_with_input(BenchmarkId::new("found", size), &size, |b, &size| {
let array = (0..size)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
let list_array = make_array(array);
let needle = lit("TICKER0005");

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
});

group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, &size| {
let array = (0..size)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
let list_array = make_array(array);
let needle = lit("NOTFOUND");

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
});
}

group.finish();
}

fn bench_array_has_all_strings(c: &mut Criterion) {
let mut group = c.benchmark_group("array_has_all_strings");

// Realistic scenario: checking if a portfolio contains certain tickers
let portfolio_size = 100;
let check_sizes = vec![1, 3, 5, 10];

for &check_size in &check_sizes {
group.bench_with_input(
BenchmarkId::new("all_found", check_size),
&check_size,
|b, &check_size| {
let portfolio = (0..portfolio_size)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
let list_array = make_array(portfolio);

let checking = (0..check_size)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
let needle_array = make_array(checking);

b.iter(|| {
black_box(array_has_all(list_array.clone(), needle_array.clone()))
})
},
);

group.bench_with_input(
BenchmarkId::new("some_missing", check_size),
&check_size,
|b, &check_size| {
let portfolio = (0..portfolio_size)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
let list_array = make_array(portfolio);

let mut checking = (0..check_size - 1)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
checking.push(lit("NOTFOUND".to_string()));
let needle_array = make_array(checking);

b.iter(|| {
black_box(array_has_all(list_array.clone(), needle_array.clone()))
})
},
);
}

group.finish();
}

fn bench_array_has_any_strings(c: &mut Criterion) {
let mut group = c.benchmark_group("array_has_any_strings");

let portfolio_size = 100;
let check_sizes = vec![1, 3, 5, 10];

for &check_size in &check_sizes {
group.bench_with_input(
BenchmarkId::new("first_matches", check_size),
&check_size,
|b, &check_size| {
let portfolio = (0..portfolio_size)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
let list_array = make_array(portfolio);

let mut checking = vec![lit("TICKER0000".to_string())];
checking.extend((1..check_size).map(|_| lit("NOTFOUND".to_string())));
let needle_array = make_array(checking);

b.iter(|| {
black_box(array_has_any(list_array.clone(), needle_array.clone()))
})
},
);

group.bench_with_input(
BenchmarkId::new("none_match", check_size),
&check_size,
|b, &check_size| {
let portfolio = (0..portfolio_size)
.map(|i| lit(format!("TICKER{i:04}")))
.collect::<Vec<_>>();
let list_array = make_array(portfolio);

let checking = (0..check_size)
.map(|i| lit(format!("NOTFOUND{i}")))
.collect::<Vec<_>>();
let needle_array = make_array(checking);

b.iter(|| {
black_box(array_has_any(list_array.clone(), needle_array.clone()))
})
},
);
}

group.finish();
}

fn bench_array_has_edge_cases(c: &mut Criterion) {
let mut group = c.benchmark_group("array_has_edge_cases");

// Empty array
group.bench_function("empty_array", |b| {
let list_array = make_array(vec![]);
let needle = lit(1_i64);

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
});

// Single element array - found
group.bench_function("single_element_found", |b| {
let list_array = make_array(vec![lit(1_i64)]);
let needle = lit(1_i64);

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
});

// Single element array - not found
group.bench_function("single_element_not_found", |b| {
let list_array = make_array(vec![lit(1_i64)]);
let needle = lit(2_i64);

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
});

// Array with duplicates
group.bench_function("array_with_duplicates", |b| {
let array = vec![lit(1_i64); 1000];
let list_array = make_array(array);
let needle = lit(1_i64);

b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
});

// array_has_all: empty needle
group.bench_function("array_has_all_empty_needle", |b| {
let array = (0..1000).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_array = make_array(vec![]);

b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
});

// array_has_any: empty needle
group.bench_function("array_has_any_empty_needle", |b| {
let array = (0..1000).map(|i| lit(i as i64)).collect::<Vec<_>>();
let list_array = make_array(array);
let needle_array = make_array(vec![]);

b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
});

group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);