Skip to content

Commit

Permalink
Make more use of the bitmaps
Browse files Browse the repository at this point in the history
  • Loading branch information
xhochy committed Jan 16, 2017
1 parent 685ad34 commit 3424ae3
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 17 deletions.
40 changes: 33 additions & 7 deletions src/parquet/arrow/reader.cc
Expand Up @@ -285,10 +285,18 @@ Status FlatColumnReader::Impl::ReadNullableFlatBatch(
values, &null_count, valid_bits_ptr_, valid_bits_idx_));

auto data_ptr = reinterpret_cast<ArrowCType*>(data_buffer_ptr_);
int byte_offset = valid_bits_idx_ / 8;
int bit_offset = valid_bits_idx_ % 8;
uint8_t bitset = valid_bits_ptr_[byte_offset];

for (int64_t i = 0; i < *levels_read; i++) {
// TODO: Replace with bitmap
if (def_levels[i] == descr_->max_definition_level()) {
data_ptr[valid_bits_idx_ + i] = values[i];
if (bitset & (1 << bit_offset)) { data_ptr[valid_bits_idx_ + i] = values[i]; }

bit_offset++;
if (bit_offset == 8) {
bit_offset = 0;
byte_offset++;
bitset = valid_bits_ptr_[byte_offset];
}
}
null_count_ += null_count;
Expand Down Expand Up @@ -329,11 +337,20 @@ Status FlatColumnReader::Impl::ReadNullableFlatBatch<::arrow::TimestampType, Int
values, &null_count, valid_bits_ptr_, valid_bits_idx_));

auto data_ptr = reinterpret_cast<int64_t*>(data_buffer_ptr_);
int byte_offset = valid_bits_idx_ / 8;
int bit_offset = valid_bits_idx_ % 8;
uint8_t bitset = valid_bits_ptr_[byte_offset];
for (int64_t i = 0; i < *levels_read; i++) {
// TODO: Use valid_bits_
if (def_levels[i] == descr_->max_definition_level()) {
if (bitset & (1 << bit_offset)) {
data_ptr[valid_bits_idx_ + i] = impala_timestamp_to_nanoseconds(values[i]);
}

bit_offset++;
if (bit_offset == 8) {
bit_offset = 0;
byte_offset++;
bitset = valid_bits_ptr_[byte_offset];
}
}
null_count_ += null_count;
valid_bits_idx_ += *levels_read;
Expand All @@ -350,11 +367,20 @@ Status FlatColumnReader::Impl::ReadNullableFlatBatch<::arrow::BooleanType, Boole
reader->ReadBatchSpaced(values_to_read, def_levels, nullptr,
values, &null_count, valid_bits_ptr_, valid_bits_idx_));

int byte_offset = valid_bits_idx_ / 8;
int bit_offset = valid_bits_idx_ % 8;
uint8_t bitset = valid_bits_ptr_[byte_offset];
for (int64_t i = 0; i < *levels_read; i++) {
// TODO: use bitmap
if (def_levels[i] == descr_->max_definition_level()) {
if (bitset & (1 << bit_offset)) {
if (values[i]) { ::arrow::BitUtil::SetBit(data_buffer_ptr_, valid_bits_idx_ + i); }
}

bit_offset++;
if (bit_offset == 8) {
bit_offset = 0;
byte_offset++;
bitset = valid_bits_ptr_[byte_offset];
}
}
valid_bits_idx_ += *levels_read;
null_count_ += null_count;
Expand Down
39 changes: 29 additions & 10 deletions src/parquet/column/reader.h
Expand Up @@ -25,8 +25,6 @@
#include <unordered_map>
#include <vector>

#include <arrow/util/bit-util.h>

#include "parquet/column/levels.h"
#include "parquet/column/page.h"
#include "parquet/encodings/decoder.h"
Expand Down Expand Up @@ -242,6 +240,33 @@ inline int64_t TypedColumnReader<DType>::ReadBatch(int batch_size, int16_t* def_
return total_values;
}

// Converts a run of definition levels into validity bits.
//
// For each of the `num_def_levels` entries in `def_levels`, the bit at position
// `valid_bits_offset + i` (least-significant bit first within each byte) of
// `valid_bits` is set when the level equals `max_definition_level` and cleared
// otherwise. Every cleared bit increments `*null_count`; the counter is
// accumulated into, not reset, so the caller must initialise it.
//
// Bits outside [valid_bits_offset, valid_bits_offset + num_def_levels) keep
// their previous value, and only bytes that contain at least one bit of that
// range are read or written. `valid_bits` must therefore be at least
// ceil((valid_bits_offset + num_def_levels) / 8) bytes long.
inline void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
    int16_t max_definition_level, int* null_count, uint8_t* valid_bits,
    int64_t valid_bits_offset) {
  // Nothing to convert: do not touch the bitmap at all, the byte at
  // valid_bits_offset / 8 may lie past the end of the buffer.
  if (num_def_levels <= 0) { return; }

  // Keep the byte index 64-bit wide so that large offsets are not truncated.
  int64_t byte_offset = valid_bits_offset / 8;
  int bit_offset = static_cast<int>(valid_bits_offset % 8);
  // Work on a local copy of the current byte and flush it once it is complete.
  uint8_t bitset = valid_bits[byte_offset];

  for (int64_t i = 0; i < num_def_levels; ++i) {
    const uint8_t mask = static_cast<uint8_t>(1 << bit_offset);
    if (def_levels[i] == max_definition_level) {
      bitset |= mask;
    } else {
      bitset &= static_cast<uint8_t>(~mask);
      *null_count += 1;
    }

    bit_offset++;
    if (bit_offset == 8) {
      bit_offset = 0;
      valid_bits[byte_offset] = bitset;
      byte_offset++;
      // Only load the next byte when another level will be written into it;
      // otherwise this would read one byte past the range covered by the
      // definition levels.
      if (i + 1 < num_def_levels) { bitset = valid_bits[byte_offset]; }
    }
  }
  // Flush a trailing, partially filled byte. Its untouched high bits were
  // loaded from the bitmap above and are thus preserved.
  if (bit_offset != 0) { valid_bits[byte_offset] = bitset; }
}

template <typename DType>
inline int64_t TypedColumnReader<DType>::ReadBatchSpaced(int batch_size,
int16_t* def_levels, int16_t* rep_levels, T* values, int* null_count_out,
Expand Down Expand Up @@ -269,14 +294,8 @@ inline int64_t TypedColumnReader<DType>::ReadBatchSpaced(int batch_size,
// TODO: Move this into the DefinitionLevels reader
int null_count = 0;
int16_t max_definition_level = descr_->max_definition_level();
for (int i = 0; i < num_def_levels; ++i) {
if (def_levels[i] == max_definition_level) {
::arrow::BitUtil::SetBit(valid_bits, valid_bits_offset + i);
} else {
::arrow::BitUtil::ClearBit(valid_bits, valid_bits_offset + i);
++null_count;
}
}
DefinitionLevelsToBitmap(def_levels, num_def_levels, max_definition_level,
&null_count, valid_bits, valid_bits_offset);
*null_count_out = null_count;

total_values = ReadValuesSpaced(
Expand Down

0 comments on commit 3424ae3

Please sign in to comment.