Skip to content

Commit

Permalink
GH-41596: [C++] fixed_width_internal.h: Simplify docstring and suppor…
Browse files Browse the repository at this point in the history
…t bit-sized types (BOOL) (#41597)

### Rationale for this change

Post-merge feedback from #41297.

### What changes are included in this PR?

 - Supporting `BOOL` as both a top-level and nested in FSL types
 - Removing the long example from the docstring of `IsFixedWidthLike`

These changes don't affect users because this header was added recently and not released.

### Are these changes tested?

Yes, by existing and new test cases.
* GitHub Issue: #41596

Authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
Signed-off-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
  • Loading branch information
felipecrv committed May 14, 2024
1 parent e411e0e commit 657c4fa
Show file tree
Hide file tree
Showing 6 changed files with 212 additions and 208 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ class PrimitiveFilterImpl {
values_is_valid_(values.buffers[0].data),
// No offset applied for boolean because it's a bitmap
values_data_(kIsBoolean ? values.buffers[1].data
: util::OffsetPointerOfFixedWidthValues(values)),
: util::OffsetPointerOfFixedByteWidthValues(values)),
values_null_count_(values.null_count),
values_offset_(values.offset),
values_length_(values.length),
Expand Down Expand Up @@ -470,7 +470,7 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
// validity bitmap.
const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero;

DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false));
DCHECK(util::IsFixedWidthLike(values));
const int64_t bit_width = util::FixedWidthInBits(*values.type);
RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
ctx, output_length, /*source=*/values, allocate_validity, out_arr));
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/vector_selection_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -898,7 +898,7 @@ Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out)
// PrimitiveFilterExec for a fixed-size list array.
if (util::IsFixedWidthLike(values,
/*force_null_count=*/true,
/*exclude_dictionary=*/true)) {
/*exclude_bool_and_dictionary=*/true)) {
const auto byte_width = util::FixedWidthInBytes(*values.type);
// 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec
// might not handle it correctly.
Expand Down Expand Up @@ -971,7 +971,7 @@ Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
// PrimitiveTakeExec for a fixed-size list array.
if (util::IsFixedWidthLike(values,
/*force_null_count=*/true,
/*exclude_dictionary=*/true)) {
/*exclude_bool_and_dictionary=*/true)) {
const auto byte_width = util::FixedWidthInBytes(*values.type);
// Additionally, PrimitiveTakeExec is only implemented for specific byte widths.
// TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ struct PrimitiveTakeImpl {
static void Exec(const ArraySpan& values, const ArraySpan& indices,
ArrayData* out_arr) {
DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth);
const auto* values_data = util::OffsetPointerOfFixedWidthValues(values);
const auto* values_data = util::OffsetPointerOfFixedByteWidthValues(values);
const uint8_t* values_is_valid = values.buffers[0].data;
auto values_offset = values.offset;

Expand Down Expand Up @@ -588,8 +588,7 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*

ArrayData* out_arr = out->array_data().get();

DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false,
/*exclude_dictionary=*/true));
DCHECK(util::IsFixedWidthLike(values));
const int64_t bit_width = util::FixedWidthInBits(*values.type);

// TODO: When neither values nor indices contain nulls, we can skip
Expand Down
100 changes: 53 additions & 47 deletions cpp/src/arrow/util/fixed_width_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,12 @@ namespace arrow::util {
using ::arrow::internal::checked_cast;

bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count,
bool exclude_dictionary) {
return IsFixedWidthLike(source, force_null_count,
[exclude_dictionary](const DataType& type) {
return !exclude_dictionary || type.id() != Type::DICTIONARY;
});
bool exclude_bool_and_dictionary) {
return IsFixedWidthLike(
source, force_null_count, [exclude_bool_and_dictionary](const DataType& type) {
return !exclude_bool_and_dictionary ||
(type.id() != Type::DICTIONARY && type.id() != Type::BOOL);
});
}

static int64_t FixedWidthInBytesFallback(const FixedSizeListType& fixed_size_list_type) {
Expand Down Expand Up @@ -73,16 +74,37 @@ int64_t FixedWidthInBytes(const DataType& type) {
return -1;
}

static int64_t FixedWidthInBitsFallback(const FixedSizeListType& fixed_size_list_type) {
auto* fsl = &fixed_size_list_type;
int64_t list_size = fsl->list_size();
for (auto type = fsl->value_type().get();;) {
auto type_id = type->id();
if (type_id == Type::FIXED_SIZE_LIST) {
fsl = checked_cast<const FixedSizeListType*>(type);
list_size *= fsl->list_size();
type = fsl->value_type().get();
continue;
}
if (is_fixed_width(type_id)) {
const int64_t flat_bit_width = list_size * type->bit_width();
DCHECK_GE(flat_bit_width, 0);
return flat_bit_width;
}
break;
}
return -1;
}

int64_t FixedWidthInBits(const DataType& type) {
auto type_id = type.id();
if (is_fixed_width(type_id)) {
return type.bit_width();
}
const int64_t byte_width = FixedWidthInBytes(type);
if (ARROW_PREDICT_FALSE(byte_width < 0)) {
return -1;
if (type_id == Type::FIXED_SIZE_LIST) {
auto& fsl = ::arrow::internal::checked_cast<const FixedSizeListType&>(type);
return FixedWidthInBitsFallback(fsl);
}
return byte_width * 8;
return -1;
}

namespace internal {
Expand Down Expand Up @@ -121,9 +143,6 @@ Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx,
if (type->id() == Type::FIXED_SIZE_LIST) {
auto& fsl_type = checked_cast<const FixedSizeListType&>(*type);
auto& value_type = fsl_type.value_type();
if (ARROW_PREDICT_FALSE(value_type->id() == Type::BOOL)) {
return Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", fsl_type);
}
if (ARROW_PREDICT_FALSE(value_type->id() == Type::DICTIONARY)) {
return Status::NotImplemented(
"PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type);
Expand All @@ -146,16 +165,13 @@ Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx,

} // namespace internal

/// \pre same as OffsetPointerOfFixedWidthValues
/// \pre source.type->id() != Type::BOOL
static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& source) {
std::pair<int, const uint8_t*> OffsetPointerOfFixedBitWidthValues(
const ArraySpan& source) {
using OffsetAndListSize = std::pair<int64_t, int64_t>;
auto get_offset = [](auto pair) { return pair.first; };
auto get_list_size = [](auto pair) { return pair.second; };
::arrow::internal::SmallVector<OffsetAndListSize, 1> stack;

DCHECK_NE(source.type->id(), Type::BOOL);

int64_t list_size = 1;
auto* array = &source;
while (array->type->id() == Type::FIXED_SIZE_LIST) {
Expand All @@ -166,31 +182,25 @@ static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& s
// Now that innermost values were reached, pop the stack and calculate the offset
// in bytes of the innermost values buffer by considering the offset at each
// level of nesting.
DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type));
DCHECK(is_fixed_width(*array->type));
DCHECK(array == &source || !array->MayHaveNulls())
<< "OffsetPointerOfFixedWidthValues: array is expected to be flat or have no "
"nulls in the arrays nested by FIXED_SIZE_LIST.";
int64_t value_width = array->type->byte_width();
int64_t offset_in_bytes = array->offset * value_width;
int64_t value_width_in_bits = array->type->bit_width();
int64_t offset_in_bits = array->offset * value_width_in_bits;
for (auto it = stack.rbegin(); it != stack.rend(); ++it) {
value_width *= get_list_size(*it);
offset_in_bytes += get_offset(*it) * value_width;
value_width_in_bits *= get_list_size(*it);
offset_in_bits += get_offset(*it) * value_width_in_bits;
}
return value_width < 0 ? nullptr : array->GetValues<uint8_t>(1, offset_in_bytes);
DCHECK_GE(value_width_in_bits, 0);
const auto* values_ptr = array->GetValues<uint8_t>(1, 0);
return {static_cast<int>(offset_in_bits % 8), values_ptr + (offset_in_bits / 8)};
}

const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) {
auto type_id = source.type->id();
if (is_fixed_width(type_id)) {
if (ARROW_PREDICT_FALSE(type_id == Type::BOOL)) {
// BOOL arrays are bit-packed, thus a byte-aligned pointer cannot be produced in the
// general case. Returning something for BOOL arrays that happen to byte-align
// because offset=0 would create too much confusion.
return nullptr;
}
return source.GetValues<uint8_t>(1, 0) + source.offset * source.type->byte_width();
}
return OffsetPointerOfFixedWidthValuesFallback(source);
const uint8_t* OffsetPointerOfFixedByteWidthValues(const ArraySpan& source) {
DCHECK(IsFixedWidthLike(source, /*force_null_count=*/false,
[](const DataType& type) { return type.id() != Type::BOOL; }));
return OffsetPointerOfFixedBitWidthValues(source).second;
}

/// \brief Get the mutable pointer to the fixed-width values of an array
Expand All @@ -203,24 +213,20 @@ const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) {
/// \return The mutable pointer to the fixed-width byte blocks of the array. If
/// pre-conditions are not satisfied, the return values is undefined.
uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array) {
auto type_id = mutable_array->type->id();
if (type_id == Type::FIXED_SIZE_LIST) {
auto* array = mutable_array;
do {
DCHECK_EQ(array->offset, 0);
DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of "
<< mutable_array->type->ToString(true);
array = array->child_data[0].get();
} while (array->type->id() == Type::FIXED_SIZE_LIST);
auto* array = mutable_array;
auto type_id = array->type->id();
while (type_id == Type::FIXED_SIZE_LIST) {
DCHECK_EQ(array->offset, 0);
DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type));
return array->GetMutableValues<uint8_t>(1, 0);
DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of "
<< mutable_array->type->ToString(true);
array = array->child_data[0].get();
type_id = array->type->id();
}
DCHECK_EQ(mutable_array->offset, 0);
// BOOL is allowed here only because the offset is expected to be 0,
// so the byte-aligned pointer also points to the first *bit* of the buffer.
DCHECK(is_fixed_width(type_id));
return mutable_array->GetMutableValues<uint8_t>(1, 0);
return array->GetMutableValues<uint8_t>(1, 0);
}

} // namespace arrow::util
Loading

0 comments on commit 657c4fa

Please sign in to comment.