-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Add benchmark for array_has/array_has_all/array_has_any #18729
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
818c12f
33125fa
cabcaa9
bc98d36
46d157f
5f32868
e9ecc15
b067f23
e7a723b
8cadff6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,377 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #[macro_use] | ||
| extern crate criterion; | ||
|
|
||
| use criterion::{BenchmarkId, Criterion}; | ||
| use datafusion_expr::lit; | ||
| use datafusion_functions_nested::expr_fn::{ | ||
| array_has, array_has_all, array_has_any, make_array, | ||
| }; | ||
| use std::hint::black_box; | ||
|
|
||
| // If not explicitly stated, `array` and `array_size` refer to the haystack array. | ||
| fn criterion_benchmark(c: &mut Criterion) { | ||
| // Test different array sizes | ||
| let array_sizes = vec![1, 10, 100, 1000, 10000]; | ||
|
|
||
| for &size in &array_sizes { | ||
| bench_array_has(c, size); | ||
| bench_array_has_all(c, size); | ||
| bench_array_has_any(c, size); | ||
| } | ||
|
|
||
| // Specific benchmarks for string arrays (common use case) | ||
| bench_array_has_strings(c); | ||
| bench_array_has_all_strings(c); | ||
| bench_array_has_any_strings(c); | ||
|
|
||
| // Edge cases | ||
| bench_array_has_edge_cases(c); | ||
| } | ||
|
|
||
| fn bench_array_has(c: &mut Criterion, array_size: usize) { | ||
| let mut group = c.benchmark_group("array_has_i64"); | ||
|
|
||
| // Benchmark: element found at beginning | ||
| group.bench_with_input( | ||
| BenchmarkId::new("found_at_start", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This generates sequential data which is not very realistic. Consider generating skewed and/or non-sequential uniform data. |
||
| let list_array = make_array(array); | ||
| let needle = lit(0_i64); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here you test the time for building the expression, not the time to execute it! |
||
| }, | ||
| ); | ||
|
|
||
| // Benchmark: element found at end | ||
| group.bench_with_input( | ||
| BenchmarkId::new("found_at_end", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle = lit((size - 1) as i64); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For small arrays the cloning operation may dominate the time. Can you use a reference instead ? |
||
| }, | ||
| ); | ||
|
|
||
| // Benchmark: element not found | ||
| group.bench_with_input( | ||
| BenchmarkId::new("not_found", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle = lit(-1_i64); // Not in array | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
| }, | ||
| ); | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| fn bench_array_has_all(c: &mut Criterion, array_size: usize) { | ||
| let mut group = c.benchmark_group("array_has_all"); | ||
|
|
||
| // Benchmark: all elements found (small needle) | ||
| group.bench_with_input( | ||
| BenchmarkId::new("all_found_small_needle", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_array = make_array(vec![lit(0_i64), lit(1_i64), lit(2_i64)]); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://github.com/apache/datafusion/pull/18729/files#diff-ad38fd714a1e2020f92ec297d8d5fcb911fa5a7f3417ce965c44c06e462dbd74R35 calls this method with a |
||
|
|
||
| b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) | ||
| }, | ||
| ); | ||
|
|
||
| // Benchmark: all elements found (medium needle - 10% of haystack) | ||
| group.bench_with_input( | ||
| BenchmarkId::new("all_found_medium_needle", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_size = (size / 10).max(1); | ||
| let needle = (0..needle_size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let needle_array = make_array(needle); | ||
|
|
||
| b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) | ||
| }, | ||
| ); | ||
|
|
||
| // Benchmark: not all found (early exit) | ||
| group.bench_with_input( | ||
| BenchmarkId::new("early_exit", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_array = make_array(vec![lit(0_i64), lit(-1_i64)]); // -1 not in array | ||
|
|
||
| b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) | ||
| }, | ||
| ); | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| fn bench_array_has_any(c: &mut Criterion, array_size: usize) { | ||
| let mut group = c.benchmark_group("array_has_any"); | ||
|
|
||
| // Benchmark: first element matches (best case) | ||
| group.bench_with_input( | ||
| BenchmarkId::new("first_match", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_array = make_array(vec![lit(0_i64), lit(-1_i64), lit(-2_i64)]); | ||
|
|
||
| b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) | ||
| }, | ||
| ); | ||
|
|
||
| // Benchmark: last element matches (worst case) | ||
| group.bench_with_input( | ||
| BenchmarkId::new("last_match", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(0_i64)]); | ||
|
|
||
| b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) | ||
| }, | ||
| ); | ||
|
|
||
| // Benchmark: no match | ||
| group.bench_with_input( | ||
| BenchmarkId::new("no_match", array_size), | ||
| &array_size, | ||
| |b, &size| { | ||
| let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(-3_i64)]); | ||
|
|
||
| b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) | ||
| }, | ||
| ); | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| fn bench_array_has_strings(c: &mut Criterion) { | ||
| let mut group = c.benchmark_group("array_has_strings"); | ||
|
|
||
| // Benchmark with string arrays (common use case for tickers, tags, etc.) | ||
| let sizes = vec![10, 100, 1000]; | ||
|
|
||
| for &size in &sizes { | ||
| group.bench_with_input(BenchmarkId::new("found", size), &size, |b, &size| { | ||
| let array = (0..size) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle = lit("TICKER0005"); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
| }); | ||
|
|
||
| group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, &size| { | ||
| let array = (0..size) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle = lit("NOTFOUND"); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
| }); | ||
| } | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| fn bench_array_has_all_strings(c: &mut Criterion) { | ||
| let mut group = c.benchmark_group("array_has_all_strings"); | ||
|
|
||
| // Realistic scenario: checking if a portfolio contains certain tickers | ||
| let portfolio_size = 100; | ||
| let check_sizes = vec![1, 3, 5, 10]; | ||
|
|
||
| for &check_size in &check_sizes { | ||
| group.bench_with_input( | ||
| BenchmarkId::new("all_found", check_size), | ||
| &check_size, | ||
| |b, &check_size| { | ||
| let portfolio = (0..portfolio_size) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| let list_array = make_array(portfolio); | ||
|
|
||
| let checking = (0..check_size) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| let needle_array = make_array(checking); | ||
|
|
||
| b.iter(|| { | ||
| black_box(array_has_all(list_array.clone(), needle_array.clone())) | ||
| }) | ||
| }, | ||
| ); | ||
|
|
||
| group.bench_with_input( | ||
| BenchmarkId::new("some_missing", check_size), | ||
| &check_size, | ||
| |b, &check_size| { | ||
| let portfolio = (0..portfolio_size) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| let list_array = make_array(portfolio); | ||
|
|
||
| let mut checking = (0..check_size - 1) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| checking.push(lit("NOTFOUND".to_string())); | ||
| let needle_array = make_array(checking); | ||
|
|
||
| b.iter(|| { | ||
| black_box(array_has_all(list_array.clone(), needle_array.clone())) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| fn bench_array_has_any_strings(c: &mut Criterion) { | ||
| let mut group = c.benchmark_group("array_has_any_strings"); | ||
|
|
||
| let portfolio_size = 100; | ||
| let check_sizes = vec![1, 3, 5, 10]; | ||
|
|
||
| for &check_size in &check_sizes { | ||
| group.bench_with_input( | ||
| BenchmarkId::new("first_matches", check_size), | ||
| &check_size, | ||
| |b, &check_size| { | ||
| let portfolio = (0..portfolio_size) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| let list_array = make_array(portfolio); | ||
|
|
||
| let mut checking = vec![lit("TICKER0000".to_string())]; | ||
| checking.extend((1..check_size).map(|_| lit("NOTFOUND".to_string()))); | ||
| let needle_array = make_array(checking); | ||
|
|
||
| b.iter(|| { | ||
| black_box(array_has_any(list_array.clone(), needle_array.clone())) | ||
| }) | ||
| }, | ||
| ); | ||
|
|
||
| group.bench_with_input( | ||
| BenchmarkId::new("none_match", check_size), | ||
| &check_size, | ||
| |b, &check_size| { | ||
| let portfolio = (0..portfolio_size) | ||
| .map(|i| lit(format!("TICKER{i:04}"))) | ||
| .collect::<Vec<_>>(); | ||
| let list_array = make_array(portfolio); | ||
|
|
||
| let checking = (0..check_size) | ||
| .map(|i| lit(format!("NOTFOUND{i}"))) | ||
| .collect::<Vec<_>>(); | ||
| let needle_array = make_array(checking); | ||
|
|
||
| b.iter(|| { | ||
| black_box(array_has_any(list_array.clone(), needle_array.clone())) | ||
| }) | ||
| }, | ||
| ); | ||
| } | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| fn bench_array_has_edge_cases(c: &mut Criterion) { | ||
| let mut group = c.benchmark_group("array_has_edge_cases"); | ||
|
|
||
| // Empty array | ||
| group.bench_function("empty_array", |b| { | ||
| let list_array = make_array(vec![]); | ||
| let needle = lit(1_i64); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
| }); | ||
|
|
||
| // Single element array - found | ||
| group.bench_function("single_element_found", |b| { | ||
| let list_array = make_array(vec![lit(1_i64)]); | ||
| let needle = lit(1_i64); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
| }); | ||
|
|
||
| // Single element array - not found | ||
| group.bench_function("single_element_not_found", |b| { | ||
| let list_array = make_array(vec![lit(1_i64)]); | ||
| let needle = lit(2_i64); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
| }); | ||
|
|
||
| // Array with duplicates | ||
| group.bench_function("array_with_duplicates", |b| { | ||
| let array = vec![lit(1_i64); 1000]; | ||
| let list_array = make_array(array); | ||
| let needle = lit(1_i64); | ||
|
|
||
| b.iter(|| black_box(array_has(list_array.clone(), needle.clone()))) | ||
| }); | ||
|
|
||
| // array_has_all: empty needle | ||
| group.bench_function("array_has_all_empty_needle", |b| { | ||
| let array = (0..1000).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_array = make_array(vec![]); | ||
|
|
||
| b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone()))) | ||
| }); | ||
|
|
||
| // array_has_any: empty needle | ||
| group.bench_function("array_has_any_empty_needle", |b| { | ||
| let array = (0..1000).map(|i| lit(i as i64)).collect::<Vec<_>>(); | ||
| let list_array = make_array(array); | ||
| let needle_array = make_array(vec![]); | ||
|
|
||
| b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone()))) | ||
| }); | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| criterion_group!(benches, criterion_benchmark); | ||
| criterion_main!(benches); | ||
Uh oh!
There was an error while loading. Please reload this page.