apache · tustvold · Oct 26, 2022 · Oct 25, 2022 · Oct 26, 2022 · Oct 26, 2022
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
@@ -259,3 +259,8 @@ required-features = ["test_utils"]
 name = "bitwise_kernel"
 harness = false
 required-features = ["test_utils"]
+
+[[bench]]
+name = "lexsort"
+harness = false
+required-features = ["test_utils"]
diff --git a/arrow/benches/lexsort.rs b/arrow/benches/lexsort.rs
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::compute::{lexsort_to_indices, SortColumn};
+use arrow::row::{RowConverter, SortField};
+use arrow::util::bench_util::{
+    create_dict_from_values, create_primitive_array, create_string_array_with_len,
+};
+use arrow_array::types::Int32Type;
+use arrow_array::{Array, ArrayRef, UInt32Array};
+use criterion::{criterion_group, criterion_main, Criterion};
+use std::sync::Arc;
+
+/// The kinds of column that the lexsort benchmarks can be run over.
+///
+/// Variant names encode the element type and whether the column is
+/// required or optional.
+#[derive(Clone, Copy)]
+enum Column {
+    /// Required 32-bit integers
+    RequiredI32,
+    /// Optional 32-bit integers
+    OptionalI32,
+    /// Required 16-character strings
+    Required16CharString,
+    /// Optional 16-character strings
+    Optional16CharString,
+    /// Optional 50-character strings
+    Optional50CharString,
+    /// Optional dictionary of 50-character strings with 100 values
+    Optional100Value50CharStringDict,
+}
+
+impl std::fmt::Debug for Column {
+    /// Writes the short label identifying this column kind, e.g. `i32_opt`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match *self {
+            Self::RequiredI32 => "i32",
+            Self::OptionalI32 => "i32_opt",
+            Self::Required16CharString => "str(16)",
+            Self::Optional16CharString => "str_opt(16)",
+            Self::Optional50CharString => "str_opt(50)",
+            Self::Optional100Value50CharStringDict => "dict(100,str_opt(50))",
+        })
+    }
+}
+
+impl Column {
+    /// Generates an array of `size` elements for this column kind.
+    ///
+    /// Required columns pass `0.` and optional columns pass `0.2` as the second
+    /// argument of the generator (presumably the null density, as it is for
+    /// `create_dict_from_values` - confirm against `bench_util`). The dictionary
+    /// column uses a null density of `0.1` for its keys.
+    fn generate(self, size: usize) -> ArrayRef {
+        match self {
+            Column::RequiredI32 => {
+                Arc::new(create_primitive_array::<Int32Type>(size, 0.))
+            }
+            Column::OptionalI32 => {
+                Arc::new(create_primitive_array::<Int32Type>(size, 0.2))
+            }
+            Column::Required16CharString => {
+                Arc::new(create_string_array_with_len::<i32>(size, 0., 16))
+            }
+            Column::Optional16CharString => {
+                Arc::new(create_string_array_with_len::<i32>(size, 0.2, 16))
+            }
+            Column::Optional50CharString => {
+                // Use the same density as the other optional columns, so that the
+                // data matches the `str_opt(50)` label
+                Arc::new(create_string_array_with_len::<i32>(size, 0.2, 50))
+            }
+            Column::Optional100Value50CharStringDict => {
+                Arc::new(create_dict_from_values::<Int32Type>(
+                    size,
+                    0.1,
+                    &create_string_array_with_len::<i32>(100, 0., 50),
+                ))
+            }
+        }
+    }
+}
+
+/// Registers two benchmarks for sorting `len` rows of the given `columns`:
+/// one using `lexsort_to_indices`, and one that converts the arrays to the
+/// row format and sorts the resulting rows.
+fn do_bench(c: &mut Criterion, columns: &[Column], len: usize) {
+    let arrays: Vec<_> = columns.iter().map(|x| x.generate(len)).collect();
+    let sort_columns: Vec<_> = arrays
+        .iter()
+        .cloned()
+        .map(|values| SortColumn {
+            values,
+            options: None,
+        })
+        .collect();
+
+    c.bench_function(
+        &format!("lexsort_to_indices({:?}): {}", columns, len),
+        |b| {
+            b.iter(|| {
+                criterion::black_box(lexsort_to_indices(&sort_columns, None).unwrap())
+            })
+        },
+    );
+
+    c.bench_function(&format!("lexsort_rows({:?}): {}", columns, len), |b| {
+        b.iter(|| {
+            // The conversion to rows happens inside the timed closure, so its
+            // cost is part of the measurement
+            criterion::black_box({
+                let fields = arrays
+                    .iter()
+                    .map(|a| SortField::new(a.data_type().clone()))
+                    .collect();
+                let mut converter = RowConverter::new(fields);
+                let rows = converter.convert_columns(&arrays).unwrap();
+                let mut sort: Vec<_> = rows.iter().enumerate().collect();
+                sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
+                UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32))
+            })
+        })
+    });
+}
+
+/// Runs `do_bench` for every column combination below, at 4096 and 4096 * 8 rows.
+fn add_benchmark(c: &mut Criterion) {
+    // Each case is one set of sort columns, listed in sort order
+    let cases: &[&[Column]] = &[
+        &[Column::RequiredI32, Column::OptionalI32],
+        &[Column::RequiredI32, Column::Optional16CharString],
+        &[Column::RequiredI32, Column::Required16CharString],
+        &[Column::Optional16CharString, Column::Required16CharString],
+        &[
+            Column::Optional16CharString,
+            Column::Optional50CharString,
+            Column::Required16CharString,
+        ],
+        &[
+            Column::Optional16CharString,
+            Column::Required16CharString,
+            Column::Optional16CharString,
+            Column::Optional16CharString,
+            Column::Optional16CharString,
+        ],
+        &[
+            Column::OptionalI32,
+            Column::Optional100Value50CharStringDict,
+        ],
+        &[
+            Column::Optional100Value50CharStringDict,
+            Column::Optional100Value50CharStringDict,
+        ],
+        &[
+            Column::Optional100Value50CharStringDict,
+            Column::Optional100Value50CharStringDict,
+            Column::Optional100Value50CharStringDict,
+            Column::Required16CharString,
+        ],
+        &[
+            Column::Optional100Value50CharStringDict,
+            Column::Optional100Value50CharStringDict,
+            Column::Optional100Value50CharStringDict,
+            Column::Optional50CharString,
+        ],
+    ];
+
+    for &case in cases {
+        do_bench(c, case, 4096);
+        do_bench(c, case, 4096 * 8);
+    }
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs
@@ -897,6 +897,10 @@ pub struct SortColumn {
 /// assert_eq!(as_primitive_array::<Int64Type>(&sorted_columns[0]).value(1), -64);
 /// assert!(sorted_columns[0].is_null(0));
 /// ```
+///
+/// Note: for multi-column sorts without a limit, using the [row format][crate::row]
+/// may be significantly faster.
+///
 pub fn lexsort(columns: &[SortColumn], limit: Option<usize>) -> Result<Vec<ArrayRef>> {
     let indices = lexsort_to_indices(columns, limit)?;
     columns
@@ -907,6 +911,9 @@ pub fn lexsort(columns: &[SortColumn], limit: Option<usize>) -> Result<Vec<Array
 
 /// Sort elements lexicographically from a list of `ArrayRef` into an unsigned integer
 /// (`UInt32Array`) of indices.
+///
+/// Note: for multi-column sorts without a limit, using the [row format][crate::row]
+/// may be significantly faster.
 pub fn lexsort_to_indices(
     columns: &[SortColumn],
     limit: Option<usize>,
@@ -942,11 +949,8 @@ pub fn lexsort_to_indices(
         lexicographical_comparator.compare(a, b)
     });
 
-    Ok(UInt32Array::from(
-        (&value_indices)[0..len]
-            .iter()
-            .map(|i| *i as u32)
-            .collect::<Vec<u32>>(),
+    // Only the first `len` indices are returned, as with the `[0..len]` slice
+    // this replaces
+    Ok(UInt32Array::from_iter_values(
+        value_indices.iter().take(len).map(|i| *i as u32),
     ))
 }
 

diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs
@@ -73,6 +73,24 @@
 //! assert_eq!(&c2_values, &["a", "f", "c", "e"]);
 //! ```
 //!
+//! It can also be used to implement a fast multi-column / lexicographic sort:
+//!
+//! ```
+//! # use arrow::row::{RowConverter, SortField};
+//! # use arrow_array::{ArrayRef, UInt32Array};
+//! fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array {
+//!     let fields = arrays
+//!         .iter()
+//!         .map(|a| SortField::new(a.data_type().clone()))
+//!         .collect();
+//!     let mut converter = RowConverter::new(fields);
+//!     let rows = converter.convert_columns(&arrays).unwrap();
+//!     let mut sort: Vec<_> = rows.iter().enumerate().collect();
+//!     sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b));
+//!     UInt32Array::from_iter_values(sort.iter().map(|(i, _)| *i as u32))
+//! }
+//! ```
+//!
 //! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts]
 //! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort]
 //! [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf]

diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
@@ -20,6 +20,8 @@
 use crate::array::*;
 use crate::datatypes::*;
 use crate::util::test_util::seedable_rng;
+use arrow_buffer::Buffer;
+use rand::distributions::uniform::SampleUniform;
 use rand::Rng;
 use rand::SeedableRng;
 use rand::{
@@ -187,3 +189,39 @@ pub fn create_fsb_array(
     }))
     .unwrap()
 }
+
+/// Creates a random (but fixed-seeded) dictionary array of a given size and null density
+/// with the provided values array.
+///
+/// Keys are drawn from the range `0..values.len()`, and each slot is null with
+/// probability `null_density`. No validity buffer is attached when `null_density`
+/// is `0.`.
+///
+/// # Panics
+///
+/// Panics if `values.len()` cannot be represented by the key type, if `values` is
+/// empty while `size` is non-zero, if `null_density` is outside of `[0, 1]`, or if
+/// building the array data fails.
+pub fn create_dict_from_values<K>(
+    size: usize,
+    null_density: f32,
+    values: &dyn Array,
+) -> DictionaryArray<K>
+where
+    K: ArrowDictionaryKeyType,
+    Standard: Distribution<K::Native>,
+    K::Native: SampleUniform,
+{
+    let mut rng = seedable_rng();
+    let data_type = DataType::Dictionary(
+        Box::new(K::DATA_TYPE),
+        Box::new(values.data_type().clone()),
+    );
+
+    // Every generated key is a valid index into `values`
+    let min_key = K::Native::from_usize(0).unwrap();
+    let max_key = K::Native::from_usize(values.len()).unwrap();
+    let keys: Buffer = (0..size).map(|_| rng.gen_range(min_key..max_key)).collect();
+
+    // A set bit in the validity bitmap marks a non-null slot, so each slot must be
+    // marked valid with probability `1 - null_density`
+    let nulls: Option<Buffer> = (null_density != 0.).then(|| {
+        (0..size)
+            .map(|_| rng.gen_bool(1. - null_density as f64))
+            .collect()
+    });
+
+    let data = ArrayDataBuilder::new(data_type)
+        .len(size)
+        .null_bit_buffer(nulls)
+        .add_buffer(keys)
+        .add_child_data(values.data().clone())
+        .build()
+        .unwrap();
+
+    DictionaryArray::from(data)
+}