Skip to content

Commit

Permalink
speed up substring_by_char by about 2.5x (#1832)
Browse files Browse the repository at this point in the history
* speed up substring_by_char

Signed-off-by: remzi <13716567376yh@gmail.com>

* better estimate the length of value buffer

Signed-off-by: remzi <13716567376yh@gmail.com>
  • Loading branch information
HaoYang670 committed Jun 11, 2022
1 parent 36ceb22 commit c08e532
Showing 1 changed file with 62 additions and 17 deletions.
79 changes: 62 additions & 17 deletions arrow/src/compute/kernels/substring.rs
Expand Up @@ -182,24 +182,69 @@ pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
start: i64,
length: Option<u64>,
) -> Result<GenericStringArray<OffsetSize>> {
Ok(array
.iter()
.map(|val| {
val.map(|val| {
let char_count = val.chars().count();
let start = if start >= 0 {
start.to_usize().unwrap().min(char_count)
} else {
char_count - (-start).to_usize().unwrap().min(char_count)
};
let length = length.map_or(char_count - start, |length| {
length.to_usize().unwrap().min(char_count - start)
});
let mut vals = BufferBuilder::<u8>::new({
let offsets = array.value_offsets();
(offsets[array.len()] - offsets[0]).to_usize().unwrap()
});
let mut new_offsets = BufferBuilder::<OffsetSize>::new(array.len() + 1);
new_offsets.append(OffsetSize::zero());
let length = length.map(|len| len.to_usize().unwrap());

array.iter().for_each(|val| {
if let Some(val) = val {
let char_count = val.chars().count();
let start = if start >= 0 {
start.to_usize().unwrap()
} else {
char_count - (-start).to_usize().unwrap().min(char_count)
};
let (start_offset, end_offset) = get_start_end_offset(val, start, length);
vals.append_slice(&val.as_bytes()[start_offset..end_offset]);
}
new_offsets.append(OffsetSize::from_usize(vals.len()).unwrap());
});
let data = unsafe {
ArrayData::new_unchecked(
GenericStringArray::<OffsetSize>::get_data_type(),
array.len(),
None,
array
.data_ref()
.null_buffer()
.map(|b| b.bit_slice(array.offset(), array.len())),
0,
vec![new_offsets.finish(), vals.finish()],
vec![],
)
};
Ok(GenericStringArray::<OffsetSize>::from(data))
}

val.chars().skip(start).take(length).collect::<String>()
})
})
.collect::<GenericStringArray<OffsetSize>>())
/// * `val` - string
/// * `start` - the start char index of the substring
/// * `length` - the char length of the substring
///
/// Return the `start` and `end` offset (by byte) of the substring
fn get_start_end_offset(
val: &str,
start: usize,
length: Option<usize>,
) -> (usize, usize) {
let len = val.len();
let mut offset_char_iter = val.char_indices();
let start_offset = offset_char_iter
.nth(start)
.map_or(len, |(offset, _)| offset);
let end_offset = length.map_or(len, |length| {
if length > 0 {
offset_char_iter
.nth(length - 1)
.map_or(len, |(offset, _)| offset)
} else {
start_offset
}
});
(start_offset, end_offset)
}

fn binary_substring<OffsetSize: OffsetSizeTrait>(
Expand Down

0 comments on commit c08e532

Please sign in to comment.