Skip to content

Commit

Permalink
Sort indices of dictionary string values (#2698)
Browse files Browse the repository at this point in the history
* Refactor dictionary string sorting

* Fix clippy
  • Loading branch information
viirya committed Sep 10, 2022
1 parent a1d24e4 commit 2f360e1
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 19 deletions.
4 changes: 2 additions & 2 deletions arrow/benches/comparison_kernels.rs
Expand Up @@ -287,8 +287,8 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});

let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0);
let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0);
let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0, 4);
let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0, 4);

c.bench_function("dict eq string", |b| {
b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
Expand Down
4 changes: 2 additions & 2 deletions arrow/benches/filter_kernels.rs
Expand Up @@ -155,7 +155,7 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_built_filter(&sparse_filter, &data_array))
});

let data_array = create_string_dict_array::<Int32Type>(size, 0.0);
let data_array = create_string_dict_array::<Int32Type>(size, 0.0, 4);
c.bench_function("filter context string dictionary (kept 1/2)", |b| {
b.iter(|| bench_built_filter(&filter, &data_array))
});
Expand All @@ -168,7 +168,7 @@ fn add_benchmark(c: &mut Criterion) {
|b| b.iter(|| bench_built_filter(&sparse_filter, &data_array)),
);

let data_array = create_string_dict_array::<Int32Type>(size, 0.5);
let data_array = create_string_dict_array::<Int32Type>(size, 0.5, 4);
c.bench_function("filter context string dictionary w NULLs (kept 1/2)", |b| {
b.iter(|| bench_built_filter(&filter, &data_array))
});
Expand Down
15 changes: 15 additions & 0 deletions arrow/benches/sort_kernel.rs
Expand Up @@ -24,6 +24,8 @@ use std::sync::Arc;
extern crate arrow;

use arrow::compute::kernels::sort::{lexsort, SortColumn};
use arrow::compute::sort_to_indices;
use arrow::datatypes::Int32Type;
use arrow::util::bench_util::*;
use arrow::{array::*, datatypes::Float32Type};

Expand Down Expand Up @@ -55,6 +57,10 @@ fn bench_sort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option<usize>) {
criterion::black_box(lexsort(&columns, limit).unwrap());
}

/// Benchmark driver for the `sort_to_indices` kernel.
///
/// Calls `sort_to_indices` on `array` with no explicit sort options (`None`)
/// and the supplied optional `limit`, unwrapping the result so that a kernel
/// error aborts the benchmark with a panic.
///
/// The computed indices are handed to `criterion::black_box` so the
/// optimizer cannot treat the sort as dead code and remove it.
fn bench_sort_to_indices(array: &ArrayRef, limit: Option<usize>) {
    let sorted_indices = sort_to_indices(array, None, limit).unwrap();
    criterion::black_box(sorted_indices);
}

fn add_benchmark(c: &mut Criterion) {
let arr_a = create_f32_array(2u64.pow(10) as usize, false);
let arr_b = create_f32_array(2u64.pow(10) as usize, false);
Expand Down Expand Up @@ -92,6 +98,15 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_sort(&arr_a, &arr_b, None))
});

let dict_arr = Arc::new(create_string_dict_array::<Int32Type>(
2u64.pow(12) as usize,
0.0,
1,
)) as ArrayRef;
c.bench_function("dict string 2^12", |b| {
b.iter(|| bench_sort_to_indices(&dict_arr, None))
});

// with limit
{
let arr_a = create_f32_array(2u64.pow(12) as usize, false);
Expand Down
37 changes: 23 additions & 14 deletions arrow/src/compute/kernels/sort.rs
Expand Up @@ -419,7 +419,18 @@ pub fn sort_to_indices(
let value_indices_map = prepare_indices_map(&sorted_value_indices);
sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp)
},
DataType::Utf8 => sort_string_dictionary::<_>(values, v, n, &options, limit),
DataType::Utf8 => {
let dict_values = values.values();
let value_null_first = if options.descending {
!options.nulls_first
} else {
options.nulls_first
};
let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first });
let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?;
let value_indices_map = prepare_indices_map(&sorted_value_indices);
sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit)
},
t => return Err(ArrowError::ComputeError(format!(
"Unsupported dictionary value type {}", t
))),
Expand Down Expand Up @@ -753,27 +764,25 @@ fn sort_string<Offset: OffsetSizeTrait>(
/// Sort dictionary encoded strings
fn sort_string_dictionary<T: ArrowDictionaryKeyType>(
values: &DictionaryArray<T>,
value_indices_map: &HashMap<usize, u32>,
value_indices: Vec<u32>,
null_indices: Vec<u32>,
options: &SortOptions,
limit: Option<usize>,
) -> UInt32Array {
let keys: &PrimitiveArray<T> = values.keys();

let dict = values.values();
let dict: &StringArray = as_string_array(dict);
// create tuples that are used for sorting
let valids = value_indices
.into_iter()
.map(|index| {
let key: T::Native = keys.value(index as usize);
let value_order = value_indices_map.get(&key.to_usize().unwrap()).unwrap();
(index, *value_order)
})
.collect::<Vec<(u32, u32)>>();

sort_string_helper(
keys,
value_indices,
null_indices,
options,
limit,
|array: &PrimitiveArray<T>, idx| -> &str {
let key: T::Native = array.value(idx as usize);
dict.value(key.to_usize().unwrap())
},
)
sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, options, limit, valids)
}

/// shared implementation between dictionary encoded and plain string arrays
Expand Down
3 changes: 2 additions & 1 deletion arrow/src/util/bench_util.rs
Expand Up @@ -124,6 +124,7 @@ pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
size: usize,
null_density: f32,
str_len: usize,
) -> DictionaryArray<K> {
let rng = &mut seedable_rng();

Expand All @@ -132,7 +133,7 @@ pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng.sample_iter(&Alphanumeric).take(4).collect();
let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
let value = String::from_utf8(value).unwrap();
Some(value)
}
Expand Down

0 comments on commit 2f360e1

Please sign in to comment.