Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds two implementations of a BitRunReader, which returns set/not-set
and the number of bits in a row. Adds benchmarks comparing the two implementations under different distributions. - Makes use of the BitRunReader in Parquet writing. - Refactors GetBatchedSpaced and GetBatchedSpacedWithDict: uses a single templated method with an added template parameter so that the code can be shared; does all checking for out-of-bounds indices in one go instead of on each pass through the literal (this is a slight behavior change, as the index returned will be different); makes use of the BitRunReader. With exactly alternating bits this shows a big performance drop, but it is generally positive across any random and/or skewed nullability. fix type cast to make appveyor happy; add predict false; one more cast for windows; remove redundant using; try to fix builds; address some comments; inline all methods; remove InvertRemainingBits and use LeastSignificantBitMask (renamed from PartialWordMask); remove LoadInitialWord, fix compile error; fix lint; Pre-rebase work; iwyu; Fix MSVC warning
- Loading branch information
1 parent
7038533
commit b104766
Showing
16 changed files
with
665 additions
and
174 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/util/bit_run_reader.h" | ||
|
||
#include <cstdint> | ||
|
||
#include "arrow/util/bit_util.h" | ||
|
||
namespace arrow { | ||
namespace internal { | ||
|
||
/// \brief Constructs a BitRunReader over `length` bits of `bitmap`, beginning
/// at bit `start_offset`.
///
/// Words are loaded starting at the byte that contains the first requested
/// bit, so positions are tracked relative to that byte: `position_` starts at
/// the sub-byte offset (0-7) and `length_` is the matching end position.
///
/// \param[in] bitmap source data
/// \param[in] start_offset bit offset into the source data
/// \param[in] length number of bits to read
BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
    : bitmap_(bitmap + (start_offset / 8)),
      position_(start_offset % 8),
      length_(position_ + length),
      word_(0),
      current_run_bit_set_(false) {
  if (ARROW_PREDICT_FALSE(length == 0)) {
    // Nothing to load: position_ == length_, so NextRun() reports the end of
    // the bitmap on its first call. All members are already initialized.
    return;
  }

  // NextRun() flips this flag before measuring a run, so store the inverse of
  // the first bit; the first flip then yields the value of the first run.
  current_run_bit_set_ = !BitUtil::GetBit(bitmap, start_offset);

  // On the initial load the word starts at the byte-aligned bitmap_, so it
  // holds position_ leading bits that precede start_offset followed by the
  // `length` requested bits. Passing exactly that count keeps LoadWord() from
  // copying a byte beyond the requested range and puts its stop sentinel
  // directly after the last valid bit. Every other call to LoadWord() should
  // only occur when position_ is a multiple of 64.
  const int64_t bits_remaining = position_ + length;
  LoadWord(bits_remaining);

  // Prepare for inversion in NextRun.
  // Clear out any preceding bits.
  word_ = word_ & ~BitUtil::LeastSignficantBitMask(position_);
}
|
||
} // namespace internal | ||
} // namespace arrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include <cstdint> | ||
#include <cstring> | ||
#include <string> | ||
|
||
#include "arrow/util/bit_util.h" | ||
#include "arrow/util/bitmap_reader.h" | ||
#include "arrow/util/macros.h" | ||
#include "arrow/util/visibility.h" | ||
|
||
namespace arrow { | ||
namespace internal { | ||
|
||
/// A run of consecutive bits that all share the same value.
struct BitRun {
  /// Number of bits in the run.
  int64_t length;
  /// Whether the bits in the run are set.
  bool set;

  /// Returns a debug representation such as "{Length: 3, set=1}".
  /// The flag is printed numerically (0 or 1).
  std::string ToString() const {
    return std::string("{Length: ") + std::to_string(length) +
           ", set=" + std::to_string(set) + "}";
  }
};

/// Two runs are equal when both their length and their bit value match.
static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
  return lhs.length == rhs.length && lhs.set == rhs.set;
}

/// Negation of operator==, provided so that both comparisons are available.
static inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
  return !(lhs == rhs);
}
|
||
class BitRunReaderScalar { | ||
public: | ||
BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t length) | ||
: reader_(bitmap, start_offset, length) {} | ||
|
||
BitRun NextRun() { | ||
BitRun rl = {/*length=*/0, reader_.IsSet()}; | ||
// Advance while the values are equal and not at the end of list. | ||
while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) { | ||
rl.length++; | ||
reader_.Next(); | ||
} | ||
return rl; | ||
} | ||
|
||
private: | ||
BitmapReader reader_; | ||
}; | ||
|
||
#if defined(ARROW_LITTLE_ENDIAN) | ||
/// A convenience class for counting the number of continguous set/unset bits | ||
/// in a bitmap. | ||
class ARROW_EXPORT BitRunReader { | ||
public: | ||
/// \brief Constructs new BitRunReader. | ||
/// | ||
/// \param[in] bitmap source data | ||
/// \param[in] start_offset bit offset into the source data | ||
/// \param[in] length number of bits to copy | ||
BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length); | ||
|
||
/// Returns a new BitRun containing the number of contiguous | ||
/// bits with the same value. length == 0 indicates the | ||
/// end of the bitmap. | ||
BitRun NextRun() { | ||
if (ARROW_PREDICT_FALSE(position_ >= length_)) { | ||
return {/*length=*/0, false}; | ||
} | ||
// This implementation relies on a efficient implementations of | ||
// CountTrailingZeros and assumes that runs are more often then | ||
// not. The logic is to incrementally find the next bit change | ||
// from the current position. This is done by zeroing all | ||
// bits in word_ up to position_ and using the TrailingZeroCount | ||
// to find the index of the next set bit. | ||
|
||
// The runs alternate on each call, so flip the bit. | ||
current_run_bit_set_ = !current_run_bit_set_; | ||
|
||
int64_t start_position = position_; | ||
int64_t start_bit_offset = start_position & 63; | ||
// Invert the word for proper use of CountTrailingZeros and | ||
// clear bits so CountTrailingZeros can do it magic. | ||
word_ = ~word_ & ~BitUtil::LeastSignficantBitMask(start_bit_offset); | ||
|
||
// Go forward until the next change from unset to set. | ||
int64_t new_bits = BitUtil::CountTrailingZeros(word_) - start_bit_offset; | ||
position_ += new_bits; | ||
|
||
if (ARROW_PREDICT_FALSE(BitUtil::IsMultipleOf64(position_)) && | ||
ARROW_PREDICT_TRUE(position_ < length_)) { | ||
// Continue extending position while we can advance an entire word. | ||
// (updates position_ accordingly). | ||
AdvanceUntilChange(); | ||
} | ||
|
||
return {/*length=*/position_ - start_position, current_run_bit_set_}; | ||
} | ||
|
||
private: | ||
void AdvanceUntilChange() { | ||
int64_t new_bits = 0; | ||
do { | ||
// Advance the position of the bitmap for loading. | ||
bitmap_ += sizeof(uint64_t); | ||
LoadNextWord(); | ||
new_bits = BitUtil::CountTrailingZeros(word_); | ||
// Continue calculating run length. | ||
position_ += new_bits; | ||
} while (ARROW_PREDICT_FALSE(BitUtil::IsMultipleOf64(position_)) && | ||
ARROW_PREDICT_TRUE(position_ < length_) && new_bits > 0); | ||
} | ||
|
||
void LoadNextWord() { return LoadWord(length_ - position_); } | ||
|
||
// Helper method for Loading the next word. | ||
void LoadWord(int64_t bits_remaining) { | ||
word_ = 0; | ||
// we need at least an extra byte in this case. | ||
if (ARROW_PREDICT_TRUE(bits_remaining >= 64)) { | ||
std::memcpy(&word_, bitmap_, 8); | ||
} else { | ||
int64_t bytes_to_load = BitUtil::BytesForBits(bits_remaining); | ||
auto word_ptr = reinterpret_cast<uint8_t*>(&word_); | ||
std::memcpy(word_ptr, bitmap_, bytes_to_load); | ||
// Ensure stoppage at last bit in bitmap by reversing the next higher | ||
// order bit. | ||
BitUtil::SetBitTo(word_ptr, bits_remaining, | ||
!BitUtil::GetBit(word_ptr, bits_remaining - 1)); | ||
} | ||
|
||
// Two cases: | ||
// 1. For unset, CountTrailingZeros works natually so we don't | ||
// invert the word. | ||
// 2. Otherwise invert so we can use CountTrailingZeros. | ||
if (current_run_bit_set_) { | ||
word_ = ~word_; | ||
} | ||
} | ||
const uint8_t* bitmap_; | ||
int64_t position_; | ||
int64_t length_; | ||
uint64_t word_; | ||
bool current_run_bit_set_; | ||
}; | ||
#else | ||
using BitRunReader = BitRunReaderScalar; | ||
#endif | ||
|
||
} // namespace internal | ||
} // namespace arrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.