Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-35749: [C++] Handle run-end encoded filters in compute kernels #35750

Merged
merged 16 commits into from
Jun 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/api_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,18 @@ namespace internal {
// These internal functions are implemented in kernels/vector_selection.cc

/// \brief Return the number of selected indices in the boolean filter
///
/// \param filter a plain or run-end encoded boolean array with or without nulls
/// \param null_selection how to handle nulls in the filter
/// \return the number of indices selected by the filter
ARROW_EXPORT
int64_t GetFilterOutputSize(const ArraySpan& filter,
FilterOptions::NullSelectionBehavior null_selection);

/// \brief Compute uint64 selection indices for use with Take given a boolean
/// filter
///
/// \param filter a plain or run-end encoded boolean array with or without nulls
/// \param null_selection how to handle nulls in the filter
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection,
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/compute/kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,10 @@ std::shared_ptr<TypeMatcher> RunEndEncoded(
std::move(value_type_matcher));
}

// Convenience overload: builds a matcher from the given value type id and
// forwards it to the overload that accepts a value type matcher.
std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id) {
  auto value_type_matcher = SameTypeId(value_type_id);
  return RunEndEncoded(std::move(value_type_matcher));
}

std::shared_ptr<TypeMatcher> RunEndEncoded(
std::shared_ptr<TypeMatcher> run_end_type_matcher,
std::shared_ptr<TypeMatcher> value_type_matcher) {
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,12 @@ ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndInteger();
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
std::shared_ptr<TypeMatcher> value_type_matcher);

/// \brief Match run-end encoded types that use any valid run-end type and
/// encode specific value types
///
/// This is a convenience overload taking a type id instead of a matcher for
/// the values field.
///
/// @param[in] value_type_id a type id that the type of the values field should match
/// @return a matcher for run-end encoded types whose values field has the given type id
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id);

/// \brief Match run-end encoded types that encode specific run-end and value types
///
/// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field
Expand Down
25 changes: 25 additions & 0 deletions cpp/src/arrow/compute/kernels/ree_util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,18 @@ class ReadWriteValue<ArrowType, in_has_validity_buffer, out_has_validity_buffer,
return valid;
}

/// \brief Check whether the input values at positions i and j are equal.
///
/// Pre-conditions guaranteed by the callers:
/// - i and j are valid indices into the values buffer
/// - the values in i and j are valid
///
/// \param i position of the first value to compare
/// \param j position of the second value to compare
/// \return true if the two values compare equal
bool CompareValuesAt(int64_t i, int64_t j) const {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand what this is doing in REE utils? This is essentially representing value access in primitive arrays.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comparing values is commonly used when run-end encoding kernels. I would happily move it out of here if you have a suggestion.

// Boolean values are read one bit at a time, so compare the individual bits.
if constexpr (std::is_same_v<ArrowType, BooleanType>) {
return bit_util::GetBit(input_values_, i) == bit_util::GetBit(input_values_, j);
} else {
// All other types are compared through their ValueRepr representation.
return (reinterpret_cast<const ValueRepr*>(input_values_))[i] ==
(reinterpret_cast<const ValueRepr*>(input_values_))[j];
}
}

/// \brief Ensure padding is zeroed in validity bitmap.
void ZeroValidityPadding(int64_t length) const {
DCHECK(output_values_);
Expand Down Expand Up @@ -166,6 +178,11 @@ class ReadWriteValue<ArrowType, in_has_validity_buffer, out_has_validity_buffer,
return valid;
}

/// \brief Check whether the fixed-width values at positions i and j are equal.
///
/// Each value occupies byte_width_ bytes in the input values buffer; the two
/// slots are compared byte by byte.
bool CompareValuesAt(int64_t i, int64_t j) const {
  const auto* lhs = input_values_ + (i * byte_width_);
  const auto* rhs = input_values_ + (j * byte_width_);
  return memcmp(lhs, rhs, byte_width_) == 0;
}

/// \brief Ensure padding is zeroed in validity bitmap.
void ZeroValidityPadding(int64_t length) const {
DCHECK(output_values_);
Expand Down Expand Up @@ -253,6 +270,14 @@ class ReadWriteValue<ArrowType, in_has_validity_buffer, out_has_validity_buffer,
return valid;
}

/// \brief Check whether the variable-length values at positions i and j are equal.
///
/// Two values are equal when they have the same length and identical bytes.
/// Reads input_offsets_ at i, i + 1, j and j + 1, so both indices must be
/// valid positions in the offsets buffer.
///
/// \param i position of the first value to compare
/// \param j position of the second value to compare
/// \return true if both values have the same length and the same bytes
bool CompareValuesAt(int64_t i, int64_t j) const {
  const offset_type len_i = input_offsets_[i + 1] - input_offsets_[i];
  const offset_type len_j = input_offsets_[j + 1] - input_offsets_[j];
  // memcmp() returns 0 when the byte ranges are identical, so its result must
  // be compared against 0. Using it directly as a boolean would report
  // same-length values as equal only when their bytes differ.
  return len_i == len_j &&
         memcmp(input_values_ + input_offsets_[i], input_values_ + input_offsets_[j],
                static_cast<size_t>(len_i)) == 0;
}

/// \brief Ensure padding is zeroed in validity bitmap.
void ZeroValidityPadding(int64_t length) const {
DCHECK(output_values_);
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/arrow/compute/kernels/vector_selection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,8 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
VectorKernel filter_base;
filter_base.init = FilterState::Init;
RegisterSelectionFunction("array_filter", array_filter_doc, filter_base,
/*selection_type=*/boolean(), filter_kernels,
GetDefaultFilterOptions(), registry);
std::move(filter_kernels), GetDefaultFilterOptions(),
registry);

DCHECK_OK(registry->AddFunction(MakeFilterMetaFunction()));

Expand All @@ -345,8 +345,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
take_base.init = TakeState::Init;
take_base.can_execute_chunkwise = false;
RegisterSelectionFunction("array_take", array_take_doc, take_base,
/*selection_type=*/match::Integer(), take_kernels,
GetDefaultTakeOptions(), registry);
std::move(take_kernels), GetDefaultTakeOptions(), registry);

DCHECK_OK(registry->AddFunction(MakeTakeMetaFunction()));

Expand Down
Loading
Loading