Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-10609: [Rust] Optimize min/max of non null strings #8673

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
44 changes: 36 additions & 8 deletions rust/arrow/benches/aggregate_kernels.rs
Expand Up @@ -19,16 +19,15 @@
extern crate criterion;
use criterion::Criterion;

use rand::Rng;
use std::sync::Arc;
use rand::{distributions::Alphanumeric, Rng};

extern crate arrow;

use arrow::array::*;
use arrow::compute::kernels::aggregate::*;
use arrow::util::test_util::seedable_rng;

fn create_array(size: usize, with_nulls: bool) -> ArrayRef {
fn create_array(size: usize, with_nulls: bool) -> Float32Array {
// use random numbers to avoid spurious compiler optimizations wrt to branching
let mut rng = seedable_rng();
let mut builder = Float32Builder::new(size);
Expand All @@ -40,19 +39,40 @@ fn create_array(size: usize, with_nulls: bool) -> ArrayRef {
builder.append_value(rng.gen()).unwrap();
}
}
Arc::new(builder.finish())
builder.finish()
}

fn bench_sum(arr_a: &ArrayRef) {
let arr_a = arr_a.as_any().downcast_ref::<Float32Array>().unwrap();
fn create_string_array(size: usize, with_nulls: bool) -> StringArray {
// use random numbers to avoid spurious compiler optimizations wrt to branching
let mut rng = seedable_rng();
let mut builder = StringBuilder::new(size);

for _ in 0..size {
if with_nulls && rng.gen::<f32>() > 0.5 {
builder.append_null().unwrap();
} else {
let string = seedable_rng()
.sample_iter(&Alphanumeric)
.take(10)
.collect::<String>();
builder.append_value(&string).unwrap();
}
}
builder.finish()
}

fn bench_sum(arr_a: &Float32Array) {
criterion::black_box(sum(&arr_a).unwrap());
}

fn bench_min(arr_a: &ArrayRef) {
let arr_a = arr_a.as_any().downcast_ref::<Float32Array>().unwrap();
fn bench_min(arr_a: &Float32Array) {
criterion::black_box(min(&arr_a).unwrap());
}

fn bench_min_string(arr_a: &StringArray) {
criterion::black_box(min_string(&arr_a).unwrap());
}

fn add_benchmark(c: &mut Criterion) {
let arr_a = create_array(512, false);

Expand All @@ -63,6 +83,14 @@ fn add_benchmark(c: &mut Criterion) {

c.bench_function("sum nulls 512", |b| b.iter(|| bench_sum(&arr_a)));
c.bench_function("min nulls 512", |b| b.iter(|| bench_min(&arr_a)));

let arr_b = create_string_array(512, false);
c.bench_function("min string 512", |b| b.iter(|| bench_min_string(&arr_b)));

let arr_b = create_string_array(512, true);
c.bench_function("min nulls string 512", |b| {
b.iter(|| bench_min_string(&arr_b))
});
}

criterion_group!(benches, add_benchmark);
Expand Down
23 changes: 10 additions & 13 deletions rust/arrow/src/compute/kernels/aggregate.rs
Expand Up @@ -32,19 +32,20 @@ fn min_max_string<T: StringOffsetSizeTrait, F: Fn(&str, &str) -> bool>(
if null_count == array.len() {
return None;
}
let mut n = "";
let mut has_value = false;
let data = array.data();

let mut n;
if null_count == 0 {
for i in 0..data.len() {
n = array.value(0);
for i in 1..data.len() {
let item = array.value(i);
if !has_value || cmp(&n, item) {
has_value = true;
if cmp(&n, item) {
n = item;
}
}
Comment on lines +39 to 44
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like this code can use fold with your changes now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I translated the other min_max_helper into using fold. couldn't figure out to convert min_max_string to use fold here.

} else {
n = "";
let mut has_value = false;

for i in 0..data.len() {
let item = array.value(i);
if data.is_valid(i) && (!has_value || cmp(&n, item)) {
Expand Down Expand Up @@ -105,13 +106,9 @@ where

if null_count == 0 {
// optimized path for arrays without null values
n = m[0];

for item in &m[1..] {
if cmp(&n, item) {
n = *item
}
}
n = m[1..]
.iter()
.fold(m[0], |max, item| if cmp(&max, item) { *item } else { max });
} else {
n = T::default_value();
let mut has_value = false;
Expand Down
6 changes: 3 additions & 3 deletions rust/arrow/src/compute/kernels/cast.rs
Expand Up @@ -2740,14 +2740,14 @@ mod tests {
(Ok(_), false) => {
panic!("Was able to cast array from {:?} to {:?} but can_cast_types reported false",
array.data_type(), to_type)
},
}
(Err(e), true) => {
panic!("Was not able to cast array from {:?} to {:?} but can_cast_types reported true. \
Error was {:?}",
array.data_type(), to_type, e)
},
}
// otherwise it was a match
_=> {},
_ => {}
};
}
}
Expand Down