From 67ae4512d32fada4d81cb9b42aebc40d814ef601 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 25 Oct 2022 17:49:18 +1300 Subject: [PATCH 1/3] Add lexsort benchmark (#2871) --- arrow/Cargo.toml | 5 + arrow/benches/lexsort.rs | 166 ++++++++++++++++++++++++++++++ arrow/src/compute/kernels/sort.rs | 14 ++- arrow/src/row/mod.rs | 18 ++++ arrow/src/util/bench_util.rs | 38 +++++++ 5 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 arrow/benches/lexsort.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 4a1668cc0fd..f20d778aee0 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -259,3 +259,8 @@ required-features = ["test_utils"] name = "bitwise_kernel" harness = false required-features = ["test_utils"] + +[[bench]] +name = "lexsort" +harness = false +required-features = ["test_utils"] diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs new file mode 100644 index 00000000000..8dc22de348e --- /dev/null +++ b/arrow/benches/lexsort.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::compute::{lexsort_to_indices, SortColumn}; +use arrow::row::{RowConverter, SortField}; +use arrow::util::bench_util::{ + create_dict_from_values, create_primitive_array, create_string_array_with_len, +}; +use arrow_array::types::Int32Type; +use arrow_array::{Array, ArrayRef, UInt32Array}; +use criterion::{criterion_group, criterion_main, Criterion}; +use std::sync::Arc; + +#[derive(Copy, Clone)] +enum Column { + RequiredI32, + OptionalI32, + Required16CharString, + Optional16CharString, + Optional50CharString, + Optional100Value50CharStringDict, +} + +impl std::fmt::Debug for Column { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Column::RequiredI32 => "i32", + Column::OptionalI32 => "i32_opt", + Column::Required16CharString => "str(16)", + Column::Optional16CharString => "str_opt(16)", + Column::Optional50CharString => "str_opt(50)", + Column::Optional100Value50CharStringDict => "dict(100,str_opt(50))", + }; + f.write_str(s) + } +} + +impl Column { + fn generate(self, size: usize) -> ArrayRef { + match self { + Column::RequiredI32 => { + Arc::new(create_primitive_array::<Int32Type>(size, 0.)) + } + Column::OptionalI32 => { + Arc::new(create_primitive_array::<Int32Type>(size, 0.2)) + } + Column::Required16CharString => { + Arc::new(create_string_array_with_len::<i32>(size, 0., 16)) + } + Column::Optional16CharString => { + Arc::new(create_string_array_with_len::<i32>(size, 0.2, 16)) + } + Column::Optional50CharString => { + Arc::new(create_string_array_with_len::<i32>(size, 0.2, 50)) + } + Column::Optional100Value50CharStringDict => { + Arc::new(create_dict_from_values::<Int32Type>( + size, + 0.1, + &create_string_array_with_len::<i32>(100, 0., 50), + )) + } + } + } +} + +fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) { + let arrays: Vec<_> = columns.iter().map(|x| x.generate(len)).collect(); + let sort_columns: Vec<_> = arrays + .iter() + .cloned() + .map(|values| SortColumn { + values, + options: None, + }) + .collect(); + + 
c.bench_function(&format!("lexsort_to_indices({:?}): {}", columns, len), |b| { + b.iter(|| criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap())) + }); + + c.bench_function(&format!("lexsort_rows({:?}): {}", columns, len), |b| { + b.iter(|| { + criterion::black_box({ + let fields = arrays + .iter() + .map(|a| SortField::new(a.data_type().clone())) + .collect(); + let mut converter = RowConverter::new(fields); + let rows = converter.convert_columns(&arrays).unwrap(); + let mut sort: Vec<_> = rows.iter().enumerate().collect(); + sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); + UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) + }) + }) + }); +} + +fn add_benchmark(c: &mut Criterion) { + let cases: &[&[Column]] = &[ + &[Column::RequiredI32, Column::OptionalI32], + &[Column::RequiredI32, Column::Optional16CharString], + &[Column::RequiredI32, Column::Required16CharString], + &[Column::Optional16CharString, Column::Required16CharString], + &[ + Column::Optional16CharString, + Column::Optional50CharString, + Column::Required16CharString, + ], + &[ + Column::Optional16CharString, + Column::Required16CharString, + Column::Optional16CharString, + Column::Optional16CharString, + Column::Optional16CharString, + ], + &[ + Column::OptionalI32, + Column::Optional100Value50CharStringDict, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Required16CharString, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional50CharString, + ], + &[ + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional100Value50CharStringDict, + Column::Optional50CharString, + ], + ]; + + for case in cases { + do_bench(c, 
*case, 4096); + do_bench(c, *case, 4096 * 8); + } +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index e2e20e75606..b297622647e 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -897,6 +897,10 @@ pub struct SortColumn { /// assert_eq!(as_primitive_array::(&sorted_columns[0]).value(1), -64); /// assert!(sorted_columns[0].is_null(0)); /// ``` +/// +/// Note: for multi-column sorts without a limit, using the [row format][crate::row] +/// may be significantly faster +/// pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result> { let indices = lexsort_to_indices(columns, limit)?; columns @@ -907,6 +911,9 @@ pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result, @@ -942,11 +949,8 @@ pub fn lexsort_to_indices( lexicographical_comparator.compare(a, b) }); - Ok(UInt32Array::from( - (&value_indices)[0..len] - .iter() - .map(|i| *i as u32) - .collect::>(), + Ok(UInt32Array::from_iter_values( + value_indices.iter().take(len).map(|i| *i as u32), )) } diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index c3aa9ea4c5a..3f11254adef 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -73,6 +73,24 @@ //! assert_eq!(&c2_values, &["a", "f", "c", "e"]); //! ``` //! +//! It can also be used to implement a fast lexicographic sort +//! +//! ``` +//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_array::{ArrayRef, UInt32Array}; +//! fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { +//! let fields = arrays +//! .iter() +//! .map(|a| SortField::new(a.data_type().clone())) +//! .collect(); +//! let mut converter = RowConverter::new(fields); +//! let rows = converter.convert_columns(&arrays).unwrap(); +//! let mut sort: Vec<_> = rows.iter().enumerate().collect(); +//! sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); +//! UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32)) +//! 
} +//! ``` +//! //! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] //! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] //! [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 3b89e7982a6..d07443301c1 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -20,6 +20,8 @@ use crate::array::*; use crate::datatypes::*; use crate::util::test_util::seedable_rng; +use arrow_buffer::Buffer; +use rand::distributions::uniform::SampleUniform; use rand::Rng; use rand::SeedableRng; use rand::{ @@ -187,3 +189,39 @@ pub fn create_fsb_array( })) .unwrap() } + +/// Creates a random (but fixed-seeded) dictionary array of a given size and null density +/// with the provided values array +pub fn create_dict_from_values<K>( + size: usize, + null_density: f32, + values: &dyn Array, +) -> DictionaryArray<K> +where + K: ArrowDictionaryKeyType, + Standard: Distribution<K::Native>, + K::Native: SampleUniform, +{ + let mut rng = seedable_rng(); + let data_type = DataType::Dictionary( + Box::new(K::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + + let min_key = K::Native::from_usize(0).unwrap(); + let max_key = K::Native::from_usize(values.len()).unwrap(); + let keys: Buffer = (0..size).map(|_| rng.gen_range(min_key..max_key)).collect(); + + let nulls: Option<Buffer> = (null_density != 0.) 
+ .then(|| (0..size).map(|_| rng.gen_bool(1. - null_density as f64)).collect()); + + let data = ArrayDataBuilder::new(data_type) + .len(size) + .null_bit_buffer(nulls) + .add_buffer(keys) + .add_child_data(values.data().clone()) + .build() + .unwrap(); + + DictionaryArray::from(data) +} From b7165c0333e8236a4040cd02c97453df4a40310a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 26 Oct 2022 15:57:18 +1300 Subject: [PATCH 2/3] Format --- arrow/benches/lexsort.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs index 8dc22de348e..3820007231a 100644 --- a/arrow/benches/lexsort.rs +++ b/arrow/benches/lexsort.rs @@ -89,9 +89,14 @@ fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) { }) .collect(); - c.bench_function(&format!("lexsort_to_indices({:?}): {}", columns, len), |b| { - b.iter(|| criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap())) - }); + c.bench_function( + &format!("lexsort_to_indices({:?}): {}", columns, len), + |b| { + b.iter(|| { + criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap()) + }) + }, + ); c.bench_function(&format!("lexsort_rows({:?}): {}", columns, len), |b| { b.iter(|| { From dee727f0aac2e3dc61faead9dad5e382ce8be0c3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 27 Oct 2022 07:10:01 +1300 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Andrew Lamb --- arrow/src/row/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 3f11254adef..8af642240e7 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -73,7 +73,7 @@ //! assert_eq!(&c2_values, &["a", "f", "c", "e"]); //! ``` //! -//! It can also be used to implement a fast lexicographic sort +//! It can also be used to implement a fast multi-column / lexicographic sort //! //! ``` //! 
# use arrow::row::{RowConverter, SortField};