Skip to content

Commit

Permalink
ARROW-786: [Format] In-memory format for 128-bit Decimals, handling o…
Browse files Browse the repository at this point in the history
…f sign bit

* Reimplement Decimal128 types to use the Int128 type as the underlying integer
representation, adapted from the Apache ORC project's C++ in-memory format.
This enables us to write integration tests and results in an in-memory
Decimal128 format that is compatible with the Java implementation
* Additionally, this PR also fixes Decimal slice comparison and adds related
regression tests
* Follow-ups include ARROW-695 (C++ Decimal integration tests), ARROW-696 (JSON
read/write support for decimals) and ARROW-1238 (Java Decimal integration
tests).

Author: Phillip Cloud <cpcloud@gmail.com>

Closes #981 from cpcloud/decimal-rewrite and squashes the following commits:

53ce04b [Phillip Cloud] Formatting
fe13ef3 [Phillip Cloud] Remove redundant constructor
86db184 [Phillip Cloud] Subclass from FixedSizeBinaryArray for code reuse
535f9ff [Phillip Cloud] Use a macro for cases
1cc43ce [Phillip Cloud] Use CHAR_BIT
355fb24 [Phillip Cloud] Include the correct header for _BitScanReverse
b53d7cd [Phillip Cloud] Share comparison code
162eeeb [Phillip Cloud] BUG: Double export
b98c894 [Phillip Cloud] BUG: Export symbols
be220c8 [Phillip Cloud] Cast so we have enough space to contain the integer
5716010 [Phillip Cloud] Cast 18 to matching type size_t for msvc
8833904 [Phillip Cloud] Remove unnecessary args to sto* function calls
628ce85 [Phillip Cloud] Fix more docs
e4a1792 [Phillip Cloud] More const
8ecb315 [Phillip Cloud] Formatting
178d3f2 [Phillip Cloud] NOLINT for MSVC specific and necessary types
38c9b50 [Phillip Cloud] Fix doc style in int128.h and add const where possible
2930d7b [Phillip Cloud] Fix naming convention in decimal-test.cc
1eab5c4 [Phillip Cloud] Remove unnecessary header from CMakeLists.txt
22eda4b [Phillip Cloud] kMaximumPrecision
9af97d8 [Phillip Cloud] MSVC fix
349dc58 [Phillip Cloud] ARROW-786: [Format] In-memory format for 128-bit Decimals, handling of sign bit
  • Loading branch information
cpcloud authored and wesm committed Aug 24, 2017
1 parent b312697 commit 750b77d
Show file tree
Hide file tree
Showing 17 changed files with 846 additions and 390 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,7 @@ set(ARROW_SRCS
src/arrow/util/compression.cc
src/arrow/util/cpu-info.cc
src/arrow/util/decimal.cc
src/arrow/util/int128.cc
src/arrow/util/key_value_metadata.cc
)

Expand Down
92 changes: 52 additions & 40 deletions cpp/src/arrow/array-decimal-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,56 +28,66 @@ namespace decimal {
template <typename T>
class DecimalTestBase {
public:
virtual std::vector<uint8_t> data(const std::vector<T>& input,
size_t byte_width) const = 0;
DecimalTestBase() : pool_(default_memory_pool()) {}

void test(int precision, const std::vector<T>& draw,
const std::vector<uint8_t>& valid_bytes,
const std::vector<uint8_t>& sign_bitmap = {}, int64_t offset = 0) const {
auto type = std::make_shared<DecimalType>(precision, 4);
int byte_width = type->byte_width();
auto pool = default_memory_pool();
auto builder = std::make_shared<DecimalBuilder>(type, pool);
size_t null_count = 0;
virtual std::vector<uint8_t> MakeData(const std::vector<T>& input,
size_t byte_width) const = 0;

void InitBuilder(const std::shared_ptr<DecimalType>& type, const std::vector<T>& draw,
const std::vector<uint8_t>& valid_bytes, int byte_width,
std::shared_ptr<DecimalBuilder>* builder, size_t* null_count) const {
*builder = std::make_shared<DecimalBuilder>(type, pool_);

size_t size = draw.size();
ASSERT_OK(builder->Reserve(size));
ASSERT_OK((*builder)->Reserve(size));

for (size_t i = 0; i < size; ++i) {
if (valid_bytes[i]) {
ASSERT_OK(builder->Append(draw[i]));
ASSERT_OK((*builder)->Append(draw[i]));
} else {
ASSERT_OK(builder->AppendNull());
++null_count;
ASSERT_OK((*builder)->AppendNull());
++*null_count;
}
}
}

std::shared_ptr<Buffer> expected_sign_bitmap;
if (!sign_bitmap.empty()) {
ASSERT_OK(BitUtil::BytesToBits(sign_bitmap, &expected_sign_bitmap));
}
void TestCreate(int precision, const std::vector<T>& draw,
const std::vector<uint8_t>& valid_bytes, int64_t offset) const {
auto type = std::make_shared<DecimalType>(precision, 4);

auto raw_bytes = data(draw, byte_width);
std::shared_ptr<DecimalBuilder> builder;

size_t null_count = 0;

const size_t size = draw.size();
const int byte_width = type->byte_width();

InitBuilder(type, draw, valid_bytes, byte_width, &builder, &null_count);

auto raw_bytes = MakeData(draw, static_cast<size_t>(byte_width));
auto expected_data = std::make_shared<Buffer>(raw_bytes.data(), size * byte_width);
std::shared_ptr<Buffer> expected_null_bitmap;
ASSERT_OK(BitUtil::BytesToBits(valid_bytes, &expected_null_bitmap));

int64_t expected_null_count = test::null_count(valid_bytes);
auto expected =
std::make_shared<DecimalArray>(type, size, expected_data, expected_null_bitmap,
expected_null_count, offset, expected_sign_bitmap);
auto expected = std::make_shared<DecimalArray>(
type, size, expected_data, expected_null_bitmap, expected_null_count, 0);

std::shared_ptr<Array> out;
ASSERT_OK(builder->Finish(&out));
ASSERT_TRUE(out->Equals(*expected));
ASSERT_TRUE(out->Slice(offset)->Equals(
*expected->Slice(offset, expected->length() - offset)));
}

private:
MemoryPool* pool_;
};

template <typename T>
class DecimalTest : public DecimalTestBase<T> {
public:
std::vector<uint8_t> data(const std::vector<T>& input,
size_t byte_width) const override {
std::vector<uint8_t> MakeData(const std::vector<T>& input,
size_t byte_width) const override {
std::vector<uint8_t> result(input.size() * byte_width);
// TODO(phillipc): There's probably a better way to do this
constexpr static const size_t bytes_per_element = sizeof(T);
Expand All @@ -91,16 +101,15 @@ class DecimalTest : public DecimalTestBase<T> {
template <>
class DecimalTest<Decimal128> : public DecimalTestBase<Decimal128> {
public:
std::vector<uint8_t> data(const std::vector<Decimal128>& input,
size_t byte_width) const override {
std::vector<uint8_t> MakeData(const std::vector<Decimal128>& input,
size_t byte_width) const override {
std::vector<uint8_t> result;
result.reserve(input.size() * byte_width);
constexpr static const size_t bytes_per_element = 16;
for (size_t i = 0; i < input.size(); ++i) {
uint8_t stack_bytes[bytes_per_element] = {0};
uint8_t* bytes = stack_bytes;
bool is_negative;
ToBytes(input[i], &bytes, &is_negative);
ToBytes(input[i], &bytes);

for (size_t i = 0; i < bytes_per_element; ++i) {
result.push_back(bytes[i]);
Expand All @@ -124,40 +133,44 @@ TEST_P(Decimal32BuilderTest, NoNulls) {
std::vector<Decimal32> draw = {Decimal32(1), Decimal32(2), Decimal32(2389),
Decimal32(4), Decimal32(-12348)};
std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
this->test(precision, draw, valid_bytes);
this->TestCreate(precision, draw, valid_bytes, 0);
this->TestCreate(precision, draw, valid_bytes, 2);
}

TEST_P(Decimal64BuilderTest, NoNulls) {
int precision = GetParam();
std::vector<Decimal64> draw = {Decimal64(1), Decimal64(2), Decimal64(2389),
Decimal64(4), Decimal64(-12348)};
std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
this->test(precision, draw, valid_bytes);
this->TestCreate(precision, draw, valid_bytes, 0);
this->TestCreate(precision, draw, valid_bytes, 2);
}

TEST_P(Decimal128BuilderTest, NoNulls) {
int precision = GetParam();
std::vector<Decimal128> draw = {Decimal128(1), Decimal128(-2), Decimal128(2389),
Decimal128(4), Decimal128(-12348)};
std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
std::vector<uint8_t> sign_bitmap = {false, true, false, false, true};
this->test(precision, draw, valid_bytes, sign_bitmap);
this->TestCreate(precision, draw, valid_bytes, 0);
this->TestCreate(precision, draw, valid_bytes, 2);
}

TEST_P(Decimal32BuilderTest, WithNulls) {
int precision = GetParam();
std::vector<Decimal32> draw = {Decimal32(1), Decimal32(2), Decimal32(-1), Decimal32(4),
Decimal32(-1)};
std::vector<uint8_t> valid_bytes = {true, true, false, true, false};
this->test(precision, draw, valid_bytes);
this->TestCreate(precision, draw, valid_bytes, 0);
this->TestCreate(precision, draw, valid_bytes, 2);
}

TEST_P(Decimal64BuilderTest, WithNulls) {
int precision = GetParam();
std::vector<Decimal64> draw = {Decimal64(-1), Decimal64(2), Decimal64(-1), Decimal64(4),
Decimal64(-1)};
std::vector<uint8_t> valid_bytes = {true, true, false, true, false};
this->test(precision, draw, valid_bytes);
this->TestCreate(precision, draw, valid_bytes, 0);
this->TestCreate(precision, draw, valid_bytes, 2);
}

TEST_P(Decimal128BuilderTest, WithNulls) {
Expand All @@ -173,9 +186,8 @@ TEST_P(Decimal128BuilderTest, WithNulls) {
Decimal128("-23049302932.235234")};
std::vector<uint8_t> valid_bytes = {true, true, false, true, false,
true, true, true, true};
std::vector<uint8_t> sign_bitmap = {false, false, false, false, false,
false, false, false, true};
this->test(precision, draw, valid_bytes, sign_bitmap);
this->TestCreate(precision, draw, valid_bytes, 0);
this->TestCreate(precision, draw, valid_bytes, 2);
}

INSTANTIATE_TEST_CASE_P(Decimal32BuilderTest, Decimal32BuilderTest,
Expand All @@ -185,8 +197,8 @@ INSTANTIATE_TEST_CASE_P(Decimal64BuilderTest, Decimal64BuilderTest,
::testing::Range(DecimalPrecision<int64_t>::minimum,
DecimalPrecision<int64_t>::maximum));
INSTANTIATE_TEST_CASE_P(Decimal128BuilderTest, Decimal128BuilderTest,
::testing::Range(DecimalPrecision<int128_t>::minimum,
DecimalPrecision<int128_t>::maximum));
::testing::Range(DecimalPrecision<Int128>::minimum,
DecimalPrecision<Int128>::maximum));

} // namespace decimal
} // namespace arrow
64 changes: 16 additions & 48 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t le

const uint8_t* PrimitiveArray::raw_values() const {
return raw_values_ +
offset() * static_cast<const FixedWidthType&>(*type()).bit_width() / 8;
offset() * static_cast<const FixedWidthType&>(*type()).bit_width() / CHAR_BIT;
}

template <typename T>
Expand Down Expand Up @@ -323,7 +323,6 @@ std::shared_ptr<Array> StringArray::Slice(int64_t offset, int64_t length) const

FixedSizeBinaryArray::FixedSizeBinaryArray(
const std::shared_ptr<internal::ArrayData>& data) {
DCHECK_EQ(data->type->id(), Type::FIXED_SIZE_BINARY);
SetData(data);
}

Expand All @@ -346,61 +345,30 @@ const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const {
// ----------------------------------------------------------------------
// Decimal

DecimalArray::DecimalArray(const std::shared_ptr<internal::ArrayData>& data) {
DecimalArray::DecimalArray(const std::shared_ptr<internal::ArrayData>& data)
: FixedSizeBinaryArray(data) {
DCHECK_EQ(data->type->id(), Type::DECIMAL);
SetData(data);
}

void DecimalArray::SetData(const std::shared_ptr<ArrayData>& data) {
auto fixed_size_data = data->buffers[1];
auto sign_bitmap = data->buffers[2];
this->Array::SetData(data);

raw_values_ = fixed_size_data != nullptr ? fixed_size_data->data() : nullptr;
sign_bitmap_data_ = sign_bitmap != nullptr ? sign_bitmap->data() : nullptr;
}

DecimalArray::DecimalArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset, const std::shared_ptr<Buffer>& sign_bitmap) {
BufferVector buffers = {null_bitmap, data, sign_bitmap};
SetData(
std::make_shared<ArrayData>(type, length, std::move(buffers), null_count, offset));
}

bool DecimalArray::IsNegative(int64_t i) const {
return sign_bitmap_data_ != nullptr ? BitUtil::GetBit(sign_bitmap_data_, i) : false;
}

const uint8_t* DecimalArray::GetValue(int64_t i) const {
return raw_values_ + (i + data_->offset) * byte_width();
}
// Expands to a single `case bits:` arm for use inside a switch statement.
// The arm declares a decimal::Decimal<bits> value (the type name is formed by
// token-pasting `bits` onto `Decimal`), fills it from `bytes` via
// decimal::FromBytes, and returns the result of decimal::ToString called with
// `precision` and `scale`. Because the arm returns, there is no fallthrough.
#define DECIMAL_TO_STRING_CASE(bits, bytes, precision, scale) \
case bits: { \
decimal::Decimal##bits value; \
decimal::FromBytes((bytes), &value); \
return decimal::ToString(value, (precision), (scale)); \
}

std::string DecimalArray::FormatValue(int64_t i) const {
const auto& type_ = static_cast<const DecimalType&>(*type());
const int precision = type_.precision();
const int scale = type_.scale();
const int byte_width = type_.byte_width();
const uint8_t* bytes = raw_values_ + (i + data_->offset) * byte_width;
switch (byte_width) {
case 4: {
decimal::Decimal32 value;
decimal::FromBytes(bytes, &value);
return decimal::ToString(value, precision, scale);
}
case 8: {
decimal::Decimal64 value;
decimal::FromBytes(bytes, &value);
return decimal::ToString(value, precision, scale);
}
case 16: {
decimal::Decimal128 value;
decimal::FromBytes(bytes, IsNegative(i), &value);
return decimal::ToString(value, precision, scale);
}
const int bit_width = type_.bit_width();
const uint8_t* bytes = GetValue(i);
switch (bit_width) {
DECIMAL_TO_STRING_CASE(32, bytes, precision, scale)
DECIMAL_TO_STRING_CASE(64, bytes, precision, scale)
DECIMAL_TO_STRING_CASE(128, bytes, precision, scale)
default: {
DCHECK(false) << "Invalid byte width: " << byte_width;
DCHECK(false) << "Invalid bit width: " << bit_width;
return "";
}
}
Expand Down
37 changes: 4 additions & 33 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,6 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {

int32_t byte_width() const { return byte_width_; }

const uint8_t* raw_values() const { return raw_values_ + byte_width_ * data_->offset; }

std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;

protected:
Expand All @@ -536,45 +534,18 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {

// ----------------------------------------------------------------------
// DecimalArray
class ARROW_EXPORT DecimalArray : public FlatArray {
class ARROW_EXPORT DecimalArray : public FixedSizeBinaryArray {
public:
using TypeClass = Type;
using TypeClass = DecimalType;

using FixedSizeBinaryArray::FixedSizeBinaryArray;

/// \brief Construct DecimalArray from internal::ArrayData instance
explicit DecimalArray(const std::shared_ptr<internal::ArrayData>& data);

DecimalArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = nullptr,
int64_t null_count = 0, int64_t offset = 0,
const std::shared_ptr<Buffer>& sign_bitmap = nullptr);

bool IsNegative(int64_t i) const;

const uint8_t* GetValue(int64_t i) const;

std::string FormatValue(int64_t i) const;

std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;

/// \brief The main decimal data
/// For 32/64-bit decimal this is everything
std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }

/// Only needed for 128 bit Decimals
std::shared_ptr<Buffer> sign_bitmap() const { return data_->buffers[2]; }

int32_t byte_width() const {
return static_cast<const DecimalType&>(*type()).byte_width();
}

/// \brief Return pointer to value data, accounting for any offset
const uint8_t* raw_values() const { return raw_values_ + byte_width() * data_->offset; }

private:
void SetData(const std::shared_ptr<internal::ArrayData>& data);
const uint8_t* raw_values_;
const uint8_t* sign_bitmap_data_;
};

// ----------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 750b77d

Please sign in to comment.