Skip to content

Commit

Permalink
GH-36297: [C++][Parquet] Benchmark for non-binary dict encoding (#36298)
Browse files Browse the repository at this point in the history
### Rationale for this change

Add a benchmark for dictionary encoding of non-binary (fixed-width) types.

### What changes are included in this PR?

Add the benchmarks `BM_DictEncodingInt64_repeats` and `BM_DictEncodingInt64_literals`.

### Are these changes tested?

No additional tests are needed.

### Are there any user-facing changes?

no

* Closes: #36297

Authored-by: mwish <maplewish117@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
mapleFU committed Jun 27, 2023
1 parent de4936d commit 4198aac
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 7 deletions.
2 changes: 1 addition & 1 deletion cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());

for (int32_t index : buffered_indices_) {
if (!encoder.Put(index)) return -1;
if (ARROW_PREDICT_FALSE(!encoder.Put(index))) return -1;
}
encoder.Flush();

Expand Down
55 changes: 49 additions & 6 deletions cpp/src/parquet/encoding_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,29 @@ static void BM_RleDecodingSpacedBoolean(benchmark::State& state) {
BENCHMARK(BM_RleDecodingSpacedBoolean)->Apply(BM_SpacedArgs);

template <typename Type>
static void DecodeDict(std::vector<typename Type::c_type>& values,
// Shared driver for the dictionary-encoding benchmarks.
//
// Creates one encoder via MakeEncoder(Type::type_num, Encoding::RLE_DICTIONARY,
// use_dictionary=true, ...) and, on every benchmark iteration, Put()s all of
// `values` into it and then calls FlushValues(). The same encoder object is
// reused for every iteration. Afterwards the bytes/items processed counters are
// reported from values.size() and the iteration count.
//
// NOTE(review): the column descriptor always comes from Int64Schema(), whatever
// `Type` is -- presumably only Int64Type instantiations are intended; confirm
// before instantiating with another type.
static void EncodeDict(const std::vector<typename Type::c_type>& values,
                       benchmark::State& state) {
  using T = typename Type::c_type;
  using TypedEncoderT = typename EncodingTraits<Type>::Encoder;

  const int value_count = static_cast<int>(values.size());

  MemoryPool* pool = default_memory_pool();
  std::shared_ptr<ColumnDescriptor> schema = Int64Schema(Repetition::REQUIRED);

  // `encoder_owner` keeps the encoder alive; `typed_encoder` is a non-owning
  // view of it through the typed interface that exposes Put().
  auto encoder_owner = MakeEncoder(Type::type_num, Encoding::RLE_DICTIONARY,
                                   /*use_dictionary=*/true, schema.get(), pool);
  auto* typed_encoder = dynamic_cast<TypedEncoderT*>(encoder_owner.get());

  for (auto _ : state) {
    typed_encoder->Put(values.data(), value_count);
    typed_encoder->FlushValues();
  }

  const auto total_items = state.iterations() * value_count;
  state.SetBytesProcessed(total_items * sizeof(T));
  state.SetItemsProcessed(total_items);
}

template <typename Type>
static void DecodeDict(const std::vector<typename Type::c_type>& values,
benchmark::State& state) {
typedef typename Type::c_type T;
int num_values = static_cast<int>(values.size());
Expand All @@ -810,6 +832,7 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,

PARQUET_THROW_NOT_OK(indices->Resize(actual_bytes));

std::vector<T> decoded_values(num_values);
for (auto _ : state) {
auto dict_decoder = MakeTypedDecoder<Type>(Encoding::PLAIN, descr.get());
dict_decoder->SetData(dict_traits->num_entries(), dict_buffer->data(),
Expand All @@ -818,10 +841,11 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,
auto decoder = MakeDictDecoder<Type>(descr.get());
decoder->SetDict(dict_decoder.get());
decoder->SetData(num_values, indices->data(), static_cast<int>(indices->size()));
decoder->Decode(values.data(), num_values);
decoder->Decode(decoded_values.data(), num_values);
}

state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(T));
state.SetBytesProcessed(state.iterations() * num_values * sizeof(T));
state.SetItemsProcessed(state.iterations() * num_values);
}

static void BM_DictDecodingInt64_repeats(benchmark::State& state) {
Expand All @@ -834,19 +858,38 @@ static void BM_DictDecodingInt64_repeats(benchmark::State& state) {

BENCHMARK(BM_DictDecodingInt64_repeats)->Range(MIN_RANGE, MAX_RANGE);

// Benchmark: dictionary-encode an int64 input in which every element is the
// same value (64). The element count is the benchmark's range argument.
static void BM_DictEncodingInt64_repeats(benchmark::State& state) {
  using Type = Int64Type;
  using T = typename Type::c_type;

  const std::vector<T> repeated_values(state.range(0), 64);
  EncodeDict<Type>(repeated_values, state);
}

BENCHMARK(BM_DictEncodingInt64_repeats)->Range(MIN_RANGE, MAX_RANGE);

// Benchmark: dictionary-decode an int64 input in which every element is
// distinct (0, 1, ..., N-1), where N is the benchmark's range argument.
//
// The input is filled once with std::iota; a second hand-written fill loop
// would only repeat the same work (and relied on an implicit size_t -> int64
// conversion), so it is not used.
static void BM_DictDecodingInt64_literals(benchmark::State& state) {
  using Type = Int64Type;
  using T = typename Type::c_type;

  std::vector<T> values(state.range(0));
  std::iota(values.begin(), values.end(), 0);
  DecodeDict<Type>(values, state);
}

BENCHMARK(BM_DictDecodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE);

// Benchmark: dictionary-encode an int64 input in which every element is
// distinct (0, 1, ..., N-1), where N is the benchmark's range argument.
static void BM_DictEncodingInt64_literals(benchmark::State& state) {
  std::vector<Int64Type::c_type> ascending_values(state.range(0));
  std::iota(ascending_values.begin(), ascending_values.end(), 0);

  EncodeDict<Int64Type>(ascending_values, state);
}

BENCHMARK(BM_DictEncodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE);

static void BM_DictDecodingByteArray(benchmark::State& state) {
::arrow::random::RandomArrayGenerator rag(0);
// Using arrow generator to generate random data.
Expand Down

0 comments on commit 4198aac

Please sign in to comment.