Skip to content

Commit

Permalink
ARROW-17450: [C++][Parquet] Add support for uint8 boolean decode in a…
Browse files Browse the repository at this point in the history
…ddition to bool array (#14359)

Commit [4660180](4660180) added support for the RLE boolean decoder. We also refactored some additional code to make it consistent with the other decoder cases.

However, there was a downstream dependency on a `Decode` function that takes an array of `uint8` instead of `bool`. To avoid breaking existing workloads, this change adds back support for decoding the boolean datatype into an array of `uint8`.

Lead-authored-by: sfc-gh-nthimmegowda <nishanth.thimmegowda@snowflake.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
sfc-gh-nthimmegowda and pitrou committed Oct 12, 2022
1 parent a02a336 commit a9d2504
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 4 deletions.
28 changes: 26 additions & 2 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1146,11 +1146,13 @@ int PlainDecoder<DType>::Decode(T* buffer, int max_values) {
return max_values;
}

class PlainBooleanDecoder : public DecoderImpl, virtual public TypedDecoder<BooleanType> {
class PlainBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
public:
explicit PlainBooleanDecoder(const ColumnDescriptor* descr);
void SetData(int num_values, const uint8_t* data, int len) override;

// Two flavors of bool decoding
int Decode(uint8_t* buffer, int max_values) override;
int Decode(bool* buffer, int max_values) override;
int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
Expand Down Expand Up @@ -1201,6 +1203,24 @@ inline int PlainBooleanDecoder::DecodeArrow(
ParquetException::NYI("dictionaries of BooleanType");
}

/// \brief Decode booleans from the current page into a bit-packed buffer.
///
/// \param[out] buffer destination bitmap; value i is written to bit i, starting
/// at bit 0 of the first byte.
/// \param[in] max_values upper bound on the number of values to decode; it is
/// clamped to the number of values still pending in this decoder.
/// \return The number of values actually decoded.
///
/// Calls ParquetException::EofException() if the underlying bit reader runs out
/// of data before the requested values have been read.
int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
  max_values = std::min(max_values, num_values_);
  ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
  for (int i = 0; i < max_values; ++i) {
    bool val;
    if (!bit_reader_->GetValue(1, &val)) {
      ParquetException::EofException();
    }
    // BitmapWriter keeps the bits that are already present in the destination,
    // so a decoded `false` must be cleared explicitly. Otherwise stale bits from
    // a buffer that was not zeroed by the caller would be reported as `true`.
    if (val) {
      bit_writer.Set();
    } else {
      bit_writer.Clear();
    }
    bit_writer.Next();
  }
  bit_writer.Finish();
  num_values_ -= max_values;
  return max_values;
}

int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
max_values = std::min(max_values, num_values_);
if (bit_reader_->GetBatch(1, buffer, max_values) != max_values) {
Expand Down Expand Up @@ -2336,7 +2356,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
// ----------------------------------------------------------------------
// RLE_BOOLEAN_DECODER

class RleBooleanDecoder : public DecoderImpl, virtual public TypedDecoder<BooleanType> {
class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
public:
explicit RleBooleanDecoder(const ColumnDescriptor* descr)
: DecoderImpl(descr, Encoding::RLE) {}
Expand Down Expand Up @@ -2372,6 +2392,10 @@ class RleBooleanDecoder : public DecoderImpl, virtual public TypedDecoder<Boolea
return max_values;
}

// Bit-packed output overload. It performs no decoding for RLE-encoded booleans:
// neither `buffer` nor `max_values` is read, and the call only reports the
// missing feature through ParquetException::NYI.
// NOTE(review): NYI presumably throws, since no value is returned here — confirm.
int Decode(uint8_t* buffer, int max_values) override {
ParquetException::NYI("Decode(uint8_t*, int) for RleBooleanDecoder");
}

int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
typename EncodingTraits<BooleanType>::Accumulator* out) override {
Expand Down
16 changes: 15 additions & 1 deletion cpp/src/parquet/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ using FLBAEncoder = TypedEncoder<FLBAType>;
// Forward declaration of the decoder template so that the per-type names below
// can refer to it before its definition.
template <typename DType>
class TypedDecoder;

using BooleanDecoder = TypedDecoder<BooleanType>;
// Forward declaration; the class body appears further down in this header.
class BooleanDecoder;
// Convenience aliases naming the decoder for each physical type.
using Int32Decoder = TypedDecoder<Int32Type>;
using Int64Decoder = TypedDecoder<Int64Type>;
using Int96Decoder = TypedDecoder<Int96Type>;
Expand Down Expand Up @@ -394,6 +394,20 @@ class DictDecoder : virtual public TypedDecoder<DType> {
// ----------------------------------------------------------------------
// TypedEncoder specializations, traits, and factory functions

/// \brief Decoder interface for boolean columns.
///
/// Extends TypedDecoder<BooleanType> with an overload of Decode that writes
/// bit-packed output into a uint8_t buffer.
class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
public:
// Keep the base-class Decode overloads visible; declaring Decode below would
// otherwise hide them in this class.
using TypedDecoder<BooleanType>::Decode;

/// \brief Decode and bit-pack values into a buffer
///
/// \param[out] buffer destination for decoded values
/// This buffer will contain bit-packed values.
/// \param[in] max_values max values to decode.
/// \return The number of values decoded. Should be identical to max_values except
/// at the end of the current data page.
virtual int Decode(uint8_t* buffer, int max_values) = 0;
};

class FLBADecoder : virtual public TypedDecoder<FLBAType> {
public:
using TypedDecoder<FLBAType>::DecodeSpaced;
Expand Down
34 changes: 33 additions & 1 deletion cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ namespace parquet {

namespace test {

TEST(VectorBooleanTest, TestEncodeDecode) {
TEST(VectorBooleanTest, TestEncodeBoolDecode) {
// PARQUET-454
const int nvalues = 10000;
bool decode_buffer[nvalues] = {false};
Expand Down Expand Up @@ -82,6 +82,38 @@ TEST(VectorBooleanTest, TestEncodeDecode) {
}
}

TEST(VectorBooleanTest, TestEncodeIntDecode) {
  // Round-trip PLAIN-encoded booleans through the Decode(uint8_t*, int) overload
  // and compare every bit of the packed output against the input values.
  const int num_values = 10000;
  const int packed_size = static_cast<int>(bit_util::BytesForBits(num_values));

  std::vector<bool> expected;
  ::arrow::random_is_valid(num_values, 0.5 /* null prob */, &expected, 0 /* seed */);

  // Encode.
  std::unique_ptr<BooleanEncoder> encoder =
      MakeTypedEncoder<BooleanType>(Encoding::PLAIN);
  encoder->Put(expected, num_values);
  std::shared_ptr<Buffer> encoded = encoder->FlushValues();
  ASSERT_EQ(packed_size, encoded->size());

  // Decode into a bitmap of packed_size bytes.
  std::unique_ptr<BooleanDecoder> decoder =
      MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
  decoder->SetData(num_values, encoded->data(), static_cast<int>(encoded->size()));

  std::vector<uint8_t> packed(packed_size);
  const int num_decoded = decoder->Decode(packed.data(), num_values);
  ASSERT_EQ(num_values, num_decoded);

  const uint8_t* packed_bits = packed.data();
  for (int i = 0; i < num_values; ++i) {
    ASSERT_EQ(expected[i], ::arrow::bit_util::GetBit(packed_bits, i)) << i;
  }
}

// ----------------------------------------------------------------------
// test data generation

Expand Down

0 comments on commit a9d2504

Please sign in to comment.