Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sort indices of dictionary string values #2698

Merged
merged 2 commits into from
Sep 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions arrow/benches/comparison_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,8 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});

let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0);
let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0);
let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0, 4);
let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0, 4);

c.bench_function("dict eq string", |b| {
b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
Expand Down
4 changes: 2 additions & 2 deletions arrow/benches/filter_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_built_filter(&sparse_filter, &data_array))
});

let data_array = create_string_dict_array::<Int32Type>(size, 0.0);
let data_array = create_string_dict_array::<Int32Type>(size, 0.0, 4);
c.bench_function("filter context string dictionary (kept 1/2)", |b| {
b.iter(|| bench_built_filter(&filter, &data_array))
});
Expand All @@ -168,7 +168,7 @@ fn add_benchmark(c: &mut Criterion) {
|b| b.iter(|| bench_built_filter(&sparse_filter, &data_array)),
);

let data_array = create_string_dict_array::<Int32Type>(size, 0.5);
let data_array = create_string_dict_array::<Int32Type>(size, 0.5, 4);
c.bench_function("filter context string dictionary w NULLs (kept 1/2)", |b| {
b.iter(|| bench_built_filter(&filter, &data_array))
});
Expand Down
15 changes: 15 additions & 0 deletions arrow/benches/sort_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ use std::sync::Arc;
extern crate arrow;

use arrow::compute::kernels::sort::{lexsort, SortColumn};
use arrow::compute::sort_to_indices;
use arrow::datatypes::Int32Type;
use arrow::util::bench_util::*;
use arrow::{array::*, datatypes::Float32Type};

Expand Down Expand Up @@ -55,6 +57,10 @@ fn bench_sort(array_a: &ArrayRef, array_b: &ArrayRef, limit: Option<usize>) {
criterion::black_box(lexsort(&columns, limit).unwrap());
}

/// Benchmark body: computes the sort indices of `array` via `sort_to_indices`,
/// passing no explicit sort options (`None`) and the caller-supplied `limit`.
///
/// The result is handed to `criterion::black_box` so the work is not
/// optimized away.
///
/// # Panics
///
/// Panics if `sort_to_indices` returns an error (the result is unwrapped).
fn bench_sort_to_indices(array: &ArrayRef, limit: Option<usize>) {
    let sorted_indices = sort_to_indices(array, None, limit).unwrap();
    criterion::black_box(sorted_indices);
}

fn add_benchmark(c: &mut Criterion) {
let arr_a = create_f32_array(2u64.pow(10) as usize, false);
let arr_b = create_f32_array(2u64.pow(10) as usize, false);
Expand Down Expand Up @@ -92,6 +98,15 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_sort(&arr_a, &arr_b, None))
});

let dict_arr = Arc::new(create_string_dict_array::<Int32Type>(
2u64.pow(12) as usize,
0.0,
1,
)) as ArrayRef;
c.bench_function("dict string 2^12", |b| {
b.iter(|| bench_sort_to_indices(&dict_arr, None))
});

// with limit
{
let arr_a = create_f32_array(2u64.pow(12) as usize, false);
Expand Down
37 changes: 23 additions & 14 deletions arrow/src/compute/kernels/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,18 @@ pub fn sort_to_indices(
let value_indices_map = prepare_indices_map(&sorted_value_indices);
sort_primitive_dictionary::<_, _>(values, &value_indices_map, v, n, options, limit, cmp)
},
DataType::Utf8 => sort_string_dictionary::<_>(values, v, n, &options, limit),
DataType::Utf8 => {
let dict_values = values.values();
let value_null_first = if options.descending {
!options.nulls_first
} else {
options.nulls_first
};
let value_options = Some(SortOptions { descending: false, nulls_first: value_null_first });
let sorted_value_indices = sort_to_indices(dict_values, value_options, None)?;
let value_indices_map = prepare_indices_map(&sorted_value_indices);
sort_string_dictionary::<_>(values, &value_indices_map, v, n, &options, limit)
},
t => return Err(ArrowError::ComputeError(format!(
"Unsupported dictionary value type {}", t
))),
Expand Down Expand Up @@ -753,27 +764,25 @@ fn sort_string<Offset: OffsetSizeTrait>(
/// Sort dictionary encoded strings
fn sort_string_dictionary<T: ArrowDictionaryKeyType>(
values: &DictionaryArray<T>,
value_indices_map: &HashMap<usize, u32>,
value_indices: Vec<u32>,
null_indices: Vec<u32>,
options: &SortOptions,
limit: Option<usize>,
) -> UInt32Array {
let keys: &PrimitiveArray<T> = values.keys();

let dict = values.values();
let dict: &StringArray = as_string_array(dict);
// create tuples that are used for sorting
let valids = value_indices
.into_iter()
.map(|index| {
let key: T::Native = keys.value(index as usize);
let value_order = value_indices_map.get(&key.to_usize().unwrap()).unwrap();
(index, *value_order)
})
.collect::<Vec<(u32, u32)>>();

sort_string_helper(
keys,
value_indices,
null_indices,
options,
limit,
|array: &PrimitiveArray<T>, idx| -> &str {
let key: T::Native = array.value(idx as usize);
dict.value(key.to_usize().unwrap())
},
)
sort_primitive_inner::<_, _>(keys.len(), null_indices, cmp, options, limit, valids)
}

/// shared implementation between dictionary encoded and plain string arrays
Expand Down
3 changes: 2 additions & 1 deletion arrow/src/util/bench_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
size: usize,
null_density: f32,
str_len: usize,
) -> DictionaryArray<K> {
let rng = &mut seedable_rng();

Expand All @@ -132,7 +133,7 @@ pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng.sample_iter(&Alphanumeric).take(4).collect();
let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
let value = String::from_utf8(value).unwrap();
Some(value)
}
Expand Down