From ec16ddb75dba4b46f9a0221f699aa022cac71c0d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 10 Sep 2022 12:35:12 -0700 Subject: [PATCH 1/2] Refactor dictionary string sorting --- arrow/benches/comparison_kernels.rs | 4 ++-- arrow/benches/filter_kernels.rs | 4 ++-- arrow/benches/sort_kernel.rs | 15 ++++++++++++ arrow/src/compute/kernels/sort.rs | 37 ++++++++++++++++++----------- arrow/src/util/bench_util.rs | 3 ++- 5 files changed, 44 insertions(+), 19 deletions(-) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 21d83e07eec..4ad139b879f 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -287,8 +287,8 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$")) }); - let dict_arr_a = create_string_dict_array::(size, 0.0); - let dict_arr_b = create_string_dict_array::(size, 0.0); + let dict_arr_a = create_string_dict_array::(size, 0.0, 4); + let dict_arr_b = create_string_dict_array::(size, 0.0, 4); c.bench_function("dict eq string", |b| { b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b)) diff --git a/arrow/benches/filter_kernels.rs b/arrow/benches/filter_kernels.rs index be6d9027a8d..bd612994663 100644 --- a/arrow/benches/filter_kernels.rs +++ b/arrow/benches/filter_kernels.rs @@ -155,7 +155,7 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_built_filter(&sparse_filter, &data_array)) }); - let data_array = create_string_dict_array::(size, 0.0); + let data_array = create_string_dict_array::(size, 0.0, 4); c.bench_function("filter context string dictionary (kept 1/2)", |b| { b.iter(|| bench_built_filter(&filter, &data_array)) }); @@ -168,7 +168,7 @@ fn add_benchmark(c: &mut Criterion) { |b| b.iter(|| bench_built_filter(&sparse_filter, &data_array)), ); - let data_array = create_string_dict_array::(size, 0.5); + let data_array = create_string_dict_array::(size, 0.5, 4); c.bench_function("filter context string dictionary w NULLs (kept 1/2)", |b| { b.iter(|| bench_built_filter(&filter, &data_array)) }); diff --git a/arrow/benches/sort_kernel.rs b/arrow/benches/sort_kernel.rs index f9f5f24c15a..c4c6819df09 100644 --- a/arrow/benches/sort_kernel.rs +++ b/arrow/benches/sort_kernel.rs @@ -24,6 +24,8 @@ use std::sync::Arc; extern crate arrow; use arrow::compute::kernels::sort::{lexsort, SortColumn}; +use arrow::compute::sort_to_indices; +use arrow::datatypes::Int32Type; use arrow::util::bench_util::*; use arrow::{array::*, datatypes::Float32Type}; @@ -55,6 +57,10 @@ fn bench_sort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { criterion::black_box(lexsort(&columns, limit).unwrap()); } +fn bench_sort_to_indices(array: &ArrayRef, limit: Option) { + criterion::black_box(sort_to_indices(array, None, limit).unwrap()); +} + fn add_benchmark(c: &mut Criterion) { let arr_a = create_f32_array(2u64.pow(10) as usize, false); let arr_b = create_f32_array(2u64.pow(10) as usize, false); @@ -92,6 +98,15 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_sort(&arr_a, &arr_b, None)) }); + let dict_arr = Arc::new(create_string_dict_array::( + 2u64.pow(12) as usize, + 0.0, + 1, + )) as ArrayRef; + c.bench_function("dict string 2^12", |b| { + b.iter(|| bench_sort_to_indices(&dict_arr, None)) + }); + // with limit { let arr_a = create_f32_array(2u64.pow(12) as usize, false); diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 7a2d47786af..257c2bc2206 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -419,7 +419,18 @@ pub fn sort_to_indices( let value_indices_map = prepare_indices_map(&sorted_value_indices); sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp) }, - DataType::Utf8 => sort_string_dictionary::<_>(values, v, n, &options, limit), + DataType::Utf8 => { + let dict_values = values.values(); + let value_null_first = if options.descending { + !options.nulls_first + } else { + options.nulls_first + }; + let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first }); + let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?; + let value_indices_map = prepare_indices_map(&sorted_value_indices); + sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit) + }, t => return Err(ArrowError::ComputeError(format!( "Unsupported dictionary value type {}", t ))), @@ -753,6 +764,7 @@ fn sort_string( /// Sort dictionary encoded strings fn sort_string_dictionary( values: &DictionaryArray, + value_indices_map: &HashMap, value_indices: Vec, null_indices: Vec, options: &SortOptions, @@ -760,20 +772,17 @@ fn sort_string_dictionary( ) -> UInt32Array { let keys: &PrimitiveArray = values.keys(); - let dict = values.values(); - let dict: &StringArray = as_string_array(dict); + // create tuples that are used for sorting + let valids = value_indices + .into_iter() + .map(|index| { + let key: T::Native = keys.value(index as usize); + let value_order = value_indices_map.get(&key.to_usize().unwrap()).unwrap(); + (index, *value_order) + }) + .collect::>(); - sort_string_helper( - keys, - value_indices, - null_indices, - options, - limit, - |array: &PrimitiveArray, idx| -> &str { - let key: T::Native = array.value(idx as usize); - dict.value(key.to_usize().unwrap()) - }, - ) + sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, &options, limit, valids) } /// shared implementation between dictionary encoded and plain string arrays diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 395f3702d57..3b89e7982a6 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -124,6 +124,7 @@ pub fn create_string_array_with_len( pub fn create_string_dict_array( size: usize, null_density: f32, + str_len: usize, ) -> DictionaryArray { let rng = &mut seedable_rng(); @@ -132,7 +133,7 @@ pub fn create_string_dict_array( if rng.gen::() < null_density { None } else { - let value = rng.sample_iter(&Alphanumeric).take(4).collect(); + let value = rng.sample_iter(&Alphanumeric).take(str_len).collect(); let value = String::from_utf8(value).unwrap(); Some(value) } From 99aacc9a9cd17e309674f9d2ecdb59c6700a7827 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 10 Sep 2022 13:44:06 -0700 Subject: [PATCH 2/2] Fix clippy --- arrow/src/compute/kernels/sort.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 257c2bc2206..2ad9e817fed 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -782,7 +782,7 @@ fn sort_string_dictionary( }) .collect::>(); - sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, &options, limit, valids) + sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, options, limit, valids) } /// shared implementation between dictionary encoded and plain string arrays