Skip to content
Permalink
Browse files
Speed up the offsets checking (#1684)
* simplify offsets fully checking

Signed-off-by: remzi <13716567376yh@gmail.com>

* update doc
rename to offsets buffer

Signed-off-by: remzi <13716567376yh@gmail.com>

* add more tests for empty binary-like array

Signed-off-by: remzi <13716567376yh@gmail.com>

* add docs for empty binary-like array

Signed-off-by: remzi <13716567376yh@gmail.com>

* Update arrow/src/array/data.rs

fix a nit in docs.

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

* add benchmark

Signed-off-by: remzi <13716567376yh@gmail.com>

* replace windows with scan

Signed-off-by: remzi <13716567376yh@gmail.com>

* Update arrow/src/array/data.rs

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

* Update arrow/src/array/data.rs

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>
  • Loading branch information
HaoYang670 and viirya committed May 15, 2022
1 parent 5b154ea commit ff182f167e83b1fc553e213ca208bb82a28dea91
Showing 3 changed files with 187 additions and 61 deletions.
@@ -176,3 +176,7 @@ harness = false
[[bench]]
name = "string_kernels"
harness = false

[[bench]]
name = "array_data_validate"
harness = false
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#[macro_use]
extern crate criterion;
use criterion::Criterion;

extern crate arrow;

use arrow::{array::*, buffer::Buffer, datatypes::DataType};

fn create_binary_array_data(length: i32) -> ArrayData {
let value_buffer = Buffer::from_iter(0_i32..length);
let offsets_buffer = Buffer::from_iter(0_i32..length + 1);
ArrayData::try_new(
DataType::Binary,
length as usize,
None,
None,
0,
vec![offsets_buffer, value_buffer],
vec![],
)
.unwrap()
}

fn array_slice_benchmark(c: &mut Criterion) {
c.bench_function("validate_binary_array_data 20000", |b| {
b.iter(|| create_binary_array_data(20000))
});
}

criterion_group!(benches, array_slice_benchmark);
criterion_main!(benches);
@@ -726,19 +726,21 @@ impl ArrayData {
/// Returns a reference to the data in `buffer` as a typed slice
/// (typically `&[i32]` or `&[i64]`) after validating. The
/// returned slice is guaranteed to have at least `self.len + 1`
/// entries
/// entries.
///
/// For an empty array, the `buffer` can also be empty.
fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>(
&'a self,
buffer: &'a Buffer,
) -> Result<&'a [T]> {
// Validate that there are the correct number of offsets for this array's length
let required_offsets = self.len + self.offset + 1;

// An empty list-like array can have 0 offsets
if buffer.is_empty() {
if buffer.is_empty() && self.len == 0 {
return Ok(&[]);
}

// Validate that there are the correct number of offsets for this array's length
let required_offsets = self.len + self.offset + 1;

if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
return Err(ArrowError::InvalidArgumentError(format!(
"Offsets buffer size (bytes): {} isn't large enough for {}. Length {} needs {}",
@@ -1033,76 +1035,64 @@ impl ArrayData {
}

/// Calls the `validate(item_index, range)` function for each of
/// the ranges specified in the arrow offset buffer of type
/// the ranges specified in the arrow offsets buffer of type
/// `T`. Also validates that each offset is smaller than
/// `max_offset`
/// `offset_limit`
///
/// For an empty array, the offsets buffer can either be empty
/// or contain a single `0`.
///
/// For example, the offset buffer contained `[1, 2, 4]`, this
/// For example, the offsets buffer contained `[1, 2, 4]`, this
/// function would call `validate([1,2])`, and `validate([2,4])`
fn validate_each_offset<T, V>(
&self,
offset_buffer: &Buffer,
offsets_buffer: &Buffer,
offset_limit: usize,
validate: V,
) -> Result<()>
where
T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display,
V: Fn(usize, Range<usize>) -> Result<()>,
{
// An empty binary-like array can have 0 offsets
if self.len == 0 && offset_buffer.is_empty() {
return Ok(());
}

let offsets = self.typed_offsets::<T>(offset_buffer)?;

offsets
self.typed_offsets::<T>(offsets_buffer)?
.iter()
.zip(offsets.iter().skip(1))
.enumerate()
.map(|(i, (&start_offset, &end_offset))| {
let start_offset: usize = start_offset
.try_into()
.map_err(|_| {
ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: could not convert start_offset {} to usize in slot {}",
start_offset, i))
})?;
let end_offset: usize = end_offset
.try_into()
.map_err(|_| {
ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: Could not convert end_offset {} to usize in slot {}",
end_offset, i+1))
})?;

if start_offset > offset_limit {
return Err(ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: offset for slot {} out of bounds: {} > {}",
i, start_offset, offset_limit))
);
}

if end_offset > offset_limit {
return Err(ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: offset for slot {} out of bounds: {} > {}",
i, end_offset, offset_limit))
.map(|(i, x)| {
// check if the offset can be converted to usize
let r = x.to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: Could not convert offset {} to usize at position {}",
x, i))}
);
// check if the offset exceeds the limit
match r {
Ok(n) if n <= offset_limit => Ok((i, n)),
Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: offset at position {} out of bounds: {} > {}",
i, x, offset_limit))
),
Err(e) => Err(e),
}

// check range actually is low -> high
if start_offset > end_offset {
return Err(ArrowError::InvalidArgumentError(format!(
})
.scan(0_usize, |start, end| {
// check offsets are monotonically increasing
match end {
Ok((i, end)) if *start <= end => {
let range = Some(Ok((i, *start..end)));
*start = end;
range
}
Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
i, start_offset, end_offset))
);
i - 1, start, end))
)),
Err(err) => Some(Err(err)),
}

Ok((i, start_offset..end_offset))
})
.skip(1) // the first element is meaningless
.try_for_each(|res: Result<(usize, Range<usize>)>| {
let (item_index, range) = res?;
validate(item_index, range)
validate(item_index-1, range)
})
}

@@ -1821,6 +1811,90 @@ mod tests {
.unwrap();
}

#[test]
fn test_empty_utf8_array_with_empty_offsets_buffer() {
let data_buffer = Buffer::from(&[]);
let offsets_buffer = Buffer::from(&[]);
ArrayData::try_new(
DataType::Utf8,
0,
None,
None,
0,
vec![offsets_buffer, data_buffer],
vec![],
)
.unwrap();
}

#[test]
fn test_empty_utf8_array_with_single_zero_offset() {
let data_buffer = Buffer::from(&[]);
let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
ArrayData::try_new(
DataType::Utf8,
0,
None,
None,
0,
vec![offsets_buffer, data_buffer],
vec![],
)
.unwrap();
}

#[test]
#[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")]
fn test_empty_utf8_array_with_invalid_offset() {
let data_buffer = Buffer::from(&[]);
let offsets_buffer = Buffer::from_slice_ref(&[1i32]);
ArrayData::try_new(
DataType::Utf8,
0,
None,
None,
0,
vec![offsets_buffer, data_buffer],
vec![],
)
.unwrap();
}

#[test]
fn test_empty_utf8_array_with_non_zero_offset() {
let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]);
ArrayData::try_new(
DataType::Utf8,
0,
None,
None,
3,
vec![offsets_buffer, data_buffer],
vec![],
)
.unwrap();
}

#[test]
#[should_panic(
expected = "Offsets buffer size (bytes): 4 isn't large enough for LargeUtf8. Length 0 needs 1"
)]
fn test_empty_large_utf8_array_with_wrong_type_offsets() {
let data_buffer = Buffer::from(&[]);
let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
ArrayData::try_new(
DataType::LargeUtf8,
0,
None,
None,
0,
vec![offsets_buffer, data_buffer],
vec![],
)
.unwrap();
}

#[test]
#[should_panic(
expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3"
@@ -2110,31 +2184,31 @@ mod tests {

#[test]
#[should_panic(
expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_utf8_out_of_bounds() {
check_index_out_of_bounds_validation::<i32>(DataType::Utf8);
}

#[test]
#[should_panic(
expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_large_utf8_out_of_bounds() {
check_index_out_of_bounds_validation::<i64>(DataType::LargeUtf8);
}

#[test]
#[should_panic(
expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_binary_out_of_bounds() {
check_index_out_of_bounds_validation::<i32>(DataType::Binary);
}

#[test]
#[should_panic(
expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_large_binary_out_of_bounds() {
check_index_out_of_bounds_validation::<i64>(DataType::LargeBinary);
@@ -2327,7 +2401,7 @@ mod tests {

#[test]
#[should_panic(
expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4"
)]
fn test_validate_list_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
@@ -2336,7 +2410,7 @@ mod tests {

#[test]
#[should_panic(
expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4"
)]
fn test_validate_large_list_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
@@ -2346,7 +2420,7 @@ mod tests {
/// Test that the list of type `data_type` generates correct errors for negative offsets
#[test]
#[should_panic(
expected = "Offset invariant failure: Could not convert end_offset -1 to usize in slot 2"
expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2"
)]
fn test_validate_list_negative_offsets() {
let values: Int32Array =

0 comments on commit ff182f1

Please sign in to comment.