ARROW-786: [Format] In-memory format for 128-bit Decimals, handling o…

…f sign bit * Reimplement Decimal128 types to use the Int128 type as the underlying integer representation, adapted from the Apache ORC project's C++ in-memory format. This enables us to write integration tests and results in an in-memory Decimal128 format that is compatible with the Java implementation * Additionally, this PR also fixes Decimal slice comparison and adds related regression tests * Follow-ups include ARROW-695 (C++ Decimal integration tests), ARROW-696 (JSON read/write support for decimals) and ARROW-1238 (Java Decimal integration tests). Author: Phillip Cloud <cpcloud@gmail.com> Closes #981 from cpcloud/decimal-rewrite and squashes the following commits: 53ce04b [Phillip Cloud] Formatting fe13ef3 [Phillip Cloud] Remove redundant constructor 86db184 [Phillip Cloud] Subclass from FixedSizeBinaryArray for code reuse 535f9ff [Phillip Cloud] Use a macro for cases 1cc43ce [Phillip Cloud] Use CHAR_BIT 355fb24 [Phillip Cloud] Include the correct header for _BitScanReverse b53d7cd [Phillip Cloud] Share comparison code 162eeeb [Phillip Cloud] BUG: Double export b98c894 [Phillip Cloud] BUG: Export symbols be220c8 [Phillip Cloud] Cast so we have enough space to contain the integer 5716010 [Phillip Cloud] Cast 18 to matching type size_t for msvc 8833904 [Phillip Cloud] Remove unnecessary args to sto* function calls 628ce85 [Phillip Cloud] Fix more docs e4a1792 [Phillip Cloud] More const 8ecb315 [Phillip Cloud] Formatting 178d3f2 [Phillip Cloud] NOLINT for MSVC specific and necessary types 38c9b50 [Phillip Cloud] Fix doc style in int128.h and add const where possible 2930d7b [Phillip Cloud] Fix naming convention in decimal-test.cc 1eab5c4 [Phillip Cloud] Remove unnecessary header from CMakeLists.txt 22eda4b [Phillip Cloud] kMaximumPrecision 9af97d8 [Phillip Cloud] MSVC fix 349dc58 [Phillip Cloud] ARROW-786: [Format] In-memory format for 128-bit Decimals, handling of sign bit
apache · Aug 24, 2017 · 750b77d · 750b77d
1 parent b312697
commit 750b77d
Show file tree

Hide file tree

Showing 17 changed files with 846 additions and 390 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -742,6 +742,7 @@ set(ARROW_SRCS
   src/arrow/util/compression.cc
   src/arrow/util/cpu-info.cc
   src/arrow/util/decimal.cc
+  src/arrow/util/int128.cc
   src/arrow/util/key_value_metadata.cc
 )
 

diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc
@@ -28,56 +28,66 @@ namespace decimal {
 template <typename T>
 class DecimalTestBase {
  public:
-  virtual std::vector<uint8_t> data(const std::vector<T>& input,
-                                    size_t byte_width) const = 0;
+  DecimalTestBase() : pool_(default_memory_pool()) {}
 
-  void test(int precision, const std::vector<T>& draw,
-            const std::vector<uint8_t>& valid_bytes,
-            const std::vector<uint8_t>& sign_bitmap = {}, int64_t offset = 0) const {
-    auto type = std::make_shared<DecimalType>(precision, 4);
-    int byte_width = type->byte_width();
-    auto pool = default_memory_pool();
-    auto builder = std::make_shared<DecimalBuilder>(type, pool);
-    size_t null_count = 0;
+  virtual std::vector<uint8_t> MakeData(const std::vector<T>& input,
+                                        size_t byte_width) const = 0;
+
+  void InitBuilder(const std::shared_ptr<DecimalType>& type, const std::vector<T>& draw,
+                   const std::vector<uint8_t>& valid_bytes, int byte_width,
+                   std::shared_ptr<DecimalBuilder>* builder, size_t* null_count) const {
+    *builder = std::make_shared<DecimalBuilder>(type, pool_);
 
     size_t size = draw.size();
-    ASSERT_OK(builder->Reserve(size));
+    ASSERT_OK((*builder)->Reserve(size));
 
     for (size_t i = 0; i < size; ++i) {
       if (valid_bytes[i]) {
-        ASSERT_OK(builder->Append(draw[i]));
+        ASSERT_OK((*builder)->Append(draw[i]));
       } else {
-        ASSERT_OK(builder->AppendNull());
-        ++null_count;
+        ASSERT_OK((*builder)->AppendNull());
+        ++*null_count;
       }
     }
+  }
 
-    std::shared_ptr<Buffer> expected_sign_bitmap;
-    if (!sign_bitmap.empty()) {
-      ASSERT_OK(BitUtil::BytesToBits(sign_bitmap, &expected_sign_bitmap));
-    }
+  void TestCreate(int precision, const std::vector<T>& draw,
+                  const std::vector<uint8_t>& valid_bytes, int64_t offset) const {
+    auto type = std::make_shared<DecimalType>(precision, 4);
 
-    auto raw_bytes = data(draw, byte_width);
+    std::shared_ptr<DecimalBuilder> builder;
+
+    size_t null_count = 0;
+
+    const size_t size = draw.size();
+    const int byte_width = type->byte_width();
+
+    InitBuilder(type, draw, valid_bytes, byte_width, &builder, &null_count);
+
+    auto raw_bytes = MakeData(draw, static_cast<size_t>(byte_width));
     auto expected_data = std::make_shared<Buffer>(raw_bytes.data(), size * byte_width);
     std::shared_ptr<Buffer> expected_null_bitmap;
     ASSERT_OK(BitUtil::BytesToBits(valid_bytes, &expected_null_bitmap));
 
     int64_t expected_null_count = test::null_count(valid_bytes);
-    auto expected =
-        std::make_shared<DecimalArray>(type, size, expected_data, expected_null_bitmap,
-                                       expected_null_count, offset, expected_sign_bitmap);
+    auto expected = std::make_shared<DecimalArray>(
+        type, size, expected_data, expected_null_bitmap, expected_null_count, 0);
 
     std::shared_ptr<Array> out;
     ASSERT_OK(builder->Finish(&out));
-    ASSERT_TRUE(out->Equals(*expected));
+    ASSERT_TRUE(out->Slice(offset)->Equals(
+        *expected->Slice(offset, expected->length() - offset)));
   }
+
+ private:
+  MemoryPool* pool_;
 };
 
 template <typename T>
 class DecimalTest : public DecimalTestBase<T> {
  public:
-  std::vector<uint8_t> data(const std::vector<T>& input,
-                            size_t byte_width) const override {
+  std::vector<uint8_t> MakeData(const std::vector<T>& input,
+                                size_t byte_width) const override {
     std::vector<uint8_t> result(input.size() * byte_width);
     // TODO(phillipc): There's probably a better way to do this
     constexpr static const size_t bytes_per_element = sizeof(T);
@@ -91,16 +101,15 @@ class DecimalTest : public DecimalTestBase<T> {
 template <>
 class DecimalTest<Decimal128> : public DecimalTestBase<Decimal128> {
  public:
-  std::vector<uint8_t> data(const std::vector<Decimal128>& input,
-                            size_t byte_width) const override {
+  std::vector<uint8_t> MakeData(const std::vector<Decimal128>& input,
+                                size_t byte_width) const override {
     std::vector<uint8_t> result;
     result.reserve(input.size() * byte_width);
     constexpr static const size_t bytes_per_element = 16;
     for (size_t i = 0; i < input.size(); ++i) {
       uint8_t stack_bytes[bytes_per_element] = {0};
       uint8_t* bytes = stack_bytes;
-      bool is_negative;
-      ToBytes(input[i], &bytes, &is_negative);
+      ToBytes(input[i], &bytes);
 
       for (size_t i = 0; i < bytes_per_element; ++i) {
         result.push_back(bytes[i]);
@@ -124,40 +133,44 @@ TEST_P(Decimal32BuilderTest, NoNulls) {
   std::vector<Decimal32> draw = {Decimal32(1), Decimal32(2), Decimal32(2389),
                                  Decimal32(4), Decimal32(-12348)};
   std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
-  this->test(precision, draw, valid_bytes);
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
 }
 
 TEST_P(Decimal64BuilderTest, NoNulls) {
   int precision = GetParam();
   std::vector<Decimal64> draw = {Decimal64(1), Decimal64(2), Decimal64(2389),
                                  Decimal64(4), Decimal64(-12348)};
   std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
-  this->test(precision, draw, valid_bytes);
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
 }
 
 TEST_P(Decimal128BuilderTest, NoNulls) {
   int precision = GetParam();
   std::vector<Decimal128> draw = {Decimal128(1), Decimal128(-2), Decimal128(2389),
                                   Decimal128(4), Decimal128(-12348)};
   std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
-  std::vector<uint8_t> sign_bitmap = {false, true, false, false, true};
-  this->test(precision, draw, valid_bytes, sign_bitmap);
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
 }
 
 TEST_P(Decimal32BuilderTest, WithNulls) {
   int precision = GetParam();
   std::vector<Decimal32> draw = {Decimal32(1), Decimal32(2), Decimal32(-1), Decimal32(4),
                                  Decimal32(-1)};
   std::vector<uint8_t> valid_bytes = {true, true, false, true, false};
-  this->test(precision, draw, valid_bytes);
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
 }
 
 TEST_P(Decimal64BuilderTest, WithNulls) {
   int precision = GetParam();
   std::vector<Decimal64> draw = {Decimal64(-1), Decimal64(2), Decimal64(-1), Decimal64(4),
                                  Decimal64(-1)};
   std::vector<uint8_t> valid_bytes = {true, true, false, true, false};
-  this->test(precision, draw, valid_bytes);
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
 }
 
 TEST_P(Decimal128BuilderTest, WithNulls) {
@@ -173,9 +186,8 @@ TEST_P(Decimal128BuilderTest, WithNulls) {
                                   Decimal128("-23049302932.235234")};
   std::vector<uint8_t> valid_bytes = {true, true, false, true, false,
                                       true, true, true,  true};
-  std::vector<uint8_t> sign_bitmap = {false, false, false, false, false,
-                                      false, false, false, true};
-  this->test(precision, draw, valid_bytes, sign_bitmap);
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
 }
 
 INSTANTIATE_TEST_CASE_P(Decimal32BuilderTest, Decimal32BuilderTest,
@@ -185,8 +197,8 @@ INSTANTIATE_TEST_CASE_P(Decimal64BuilderTest, Decimal64BuilderTest,
                         ::testing::Range(DecimalPrecision<int64_t>::minimum,
                                          DecimalPrecision<int64_t>::maximum));
 INSTANTIATE_TEST_CASE_P(Decimal128BuilderTest, Decimal128BuilderTest,
-                        ::testing::Range(DecimalPrecision<int128_t>::minimum,
-                                         DecimalPrecision<int128_t>::maximum));
+                        ::testing::Range(DecimalPrecision<Int128>::minimum,
+                                         DecimalPrecision<Int128>::maximum));
 
 }  // namespace decimal
 }  // namespace arrow
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
@@ -161,7 +161,7 @@ PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t le
 
 const uint8_t* PrimitiveArray::raw_values() const {
   return raw_values_ +
-         offset() * static_cast<const FixedWidthType&>(*type()).bit_width() / 8;
+         offset() * static_cast<const FixedWidthType&>(*type()).bit_width() / CHAR_BIT;
 }
 
 template <typename T>
@@ -323,7 +323,6 @@ std::shared_ptr<Array> StringArray::Slice(int64_t offset, int64_t length) const
 
 FixedSizeBinaryArray::FixedSizeBinaryArray(
     const std::shared_ptr<internal::ArrayData>& data) {
-  DCHECK_EQ(data->type->id(), Type::FIXED_SIZE_BINARY);
   SetData(data);
 }
 
@@ -346,61 +345,30 @@ const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const {
 // ----------------------------------------------------------------------
 // Decimal
 
-DecimalArray::DecimalArray(const std::shared_ptr<internal::ArrayData>& data) {
+DecimalArray::DecimalArray(const std::shared_ptr<internal::ArrayData>& data)
+    : FixedSizeBinaryArray(data) {
   DCHECK_EQ(data->type->id(), Type::DECIMAL);
-  SetData(data);
-}
-
-void DecimalArray::SetData(const std::shared_ptr<ArrayData>& data) {
-  auto fixed_size_data = data->buffers[1];
-  auto sign_bitmap = data->buffers[2];
-  this->Array::SetData(data);
-
-  raw_values_ = fixed_size_data != nullptr ? fixed_size_data->data() : nullptr;
-  sign_bitmap_data_ = sign_bitmap != nullptr ? sign_bitmap->data() : nullptr;
 }
 
-DecimalArray::DecimalArray(const std::shared_ptr<DataType>& type, int64_t length,
-                           const std::shared_ptr<Buffer>& data,
-                           const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
-                           int64_t offset, const std::shared_ptr<Buffer>& sign_bitmap) {
-  BufferVector buffers = {null_bitmap, data, sign_bitmap};
-  SetData(
-      std::make_shared<ArrayData>(type, length, std::move(buffers), null_count, offset));
-}
-
-bool DecimalArray::IsNegative(int64_t i) const {
-  return sign_bitmap_data_ != nullptr ? BitUtil::GetBit(sign_bitmap_data_, i) : false;
-}
-
-const uint8_t* DecimalArray::GetValue(int64_t i) const {
-  return raw_values_ + (i + data_->offset) * byte_width();
-}
+#define DECIMAL_TO_STRING_CASE(bits, bytes, precision, scale) \
+  case bits: {                                                \
+    decimal::Decimal##bits value;                             \
+    decimal::FromBytes((bytes), &value);                      \
+    return decimal::ToString(value, (precision), (scale));    \
+  }
 
 std::string DecimalArray::FormatValue(int64_t i) const {
   const auto& type_ = static_cast<const DecimalType&>(*type());
   const int precision = type_.precision();
   const int scale = type_.scale();
-  const int byte_width = type_.byte_width();
-  const uint8_t* bytes = raw_values_ + (i + data_->offset) * byte_width;
-  switch (byte_width) {
-    case 4: {
-      decimal::Decimal32 value;
-      decimal::FromBytes(bytes, &value);
-      return decimal::ToString(value, precision, scale);
-    }
-    case 8: {
-      decimal::Decimal64 value;
-      decimal::FromBytes(bytes, &value);
-      return decimal::ToString(value, precision, scale);
-    }
-    case 16: {
-      decimal::Decimal128 value;
-      decimal::FromBytes(bytes, IsNegative(i), &value);
-      return decimal::ToString(value, precision, scale);
-    }
+  const int bit_width = type_.bit_width();
+  const uint8_t* bytes = GetValue(i);
+  switch (bit_width) {
+    DECIMAL_TO_STRING_CASE(32, bytes, precision, scale)
+    DECIMAL_TO_STRING_CASE(64, bytes, precision, scale)
+    DECIMAL_TO_STRING_CASE(128, bytes, precision, scale)
     default: {
-      DCHECK(false) << "Invalid byte width: " << byte_width;
+      DCHECK(false) << "Invalid bit width: " << bit_width;
       return "";
     }
   }

diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
@@ -521,8 +521,6 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 
   int32_t byte_width() const { return byte_width_; }
 
-  const uint8_t* raw_values() const { return raw_values_ + byte_width_ * data_->offset; }
-
   std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
 
  protected:
@@ -536,45 +534,18 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 
 // ----------------------------------------------------------------------
 // DecimalArray
-class ARROW_EXPORT DecimalArray : public FlatArray {
+class ARROW_EXPORT DecimalArray : public FixedSizeBinaryArray {
  public:
-  using TypeClass = Type;
+  using TypeClass = DecimalType;
+
+  using FixedSizeBinaryArray::FixedSizeBinaryArray;
 
   /// \brief Construct DecimalArray from internal::ArrayData instance
   explicit DecimalArray(const std::shared_ptr<internal::ArrayData>& data);
 
-  DecimalArray(const std::shared_ptr<DataType>& type, int64_t length,
-               const std::shared_ptr<Buffer>& data,
-               const std::shared_ptr<Buffer>& null_bitmap = nullptr,
-               int64_t null_count = 0, int64_t offset = 0,
-               const std::shared_ptr<Buffer>& sign_bitmap = nullptr);
-
-  bool IsNegative(int64_t i) const;
-
-  const uint8_t* GetValue(int64_t i) const;
-
   std::string FormatValue(int64_t i) const;
 
   std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
-
-  /// \brief The main decimal data
-  /// For 32/64-bit decimal this is everything
-  std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }
-
-  /// Only needed for 128 bit Decimals
-  std::shared_ptr<Buffer> sign_bitmap() const { return data_->buffers[2]; }
-
-  int32_t byte_width() const {
-    return static_cast<const DecimalType&>(*type()).byte_width();
-  }
-
-  /// \brief Return pointer to value data, accounting for any offset
-  const uint8_t* raw_values() const { return raw_values_ + byte_width() * data_->offset; }
-
- private:
-  void SetData(const std::shared_ptr<internal::ArrayData>& data);
-  const uint8_t* raw_values_;
-  const uint8_t* sign_bitmap_data_;
 };
 
 // ----------------------------------------------------------------------