Skip to content

Commit

Permalink
ARROW-1607: [C++] Implement DictionaryBuilder for Decimals
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Sep 27, 2017
1 parent 808a143 commit d925a95
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 16 deletions.
92 changes: 89 additions & 3 deletions cpp/src/arrow/array-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1200,7 +1200,7 @@ class TestFWBinaryArray : public ::testing::Test {
};

TEST_F(TestFWBinaryArray, Builder) {
const int32_t byte_width = 10;
constexpr int32_t byte_width = 10;
int64_t length = 4096;

int64_t nbytes = length * byte_width;
Expand All @@ -1215,8 +1215,7 @@ TEST_F(TestFWBinaryArray, Builder) {

std::shared_ptr<Array> result;

auto CheckResult = [this, &length, &is_valid, &raw_data,
&byte_width](const Array& result) {
auto CheckResult = [&length, &is_valid, &raw_data, byte_width](const Array& result) {
// Verify output
const auto& fw_result = static_cast<const FixedSizeBinaryArray&>(result);

Expand Down Expand Up @@ -1847,6 +1846,93 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
}

TEST(TestDecimalDictionaryBuilder, Basic) {
// Build the dictionary Array
const auto& decimal_type = arrow::decimal(2, 0);
DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());

// Test data
std::vector<Decimal128> test{12, 12, 11, 12};
for (const auto& value : test) {
ASSERT_OK(builder.Append(value.ToBytes().data()));
}

std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));

// Build expected data
FixedSizeBinaryBuilder decimal_builder(decimal_type);
ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes()));
ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes()));

std::shared_ptr<Array> decimal_array;
ASSERT_OK(decimal_builder.Finish(&decimal_array));
auto dtype = arrow::dictionary(int8(), decimal_array);

Int8Builder int_builder;
ASSERT_OK(int_builder.Append({0, 0, 1, 0}));
std::shared_ptr<Array> int_array;
ASSERT_OK(int_builder.Finish(&int_array));

DictionaryArray expected(dtype, int_array);
ASSERT_TRUE(expected.Equals(result));
}

TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
const auto& decimal_type = arrow::decimal(21, 0);

// Build the dictionary Array
DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());

// Build expected data
FixedSizeBinaryBuilder fsb_builder(decimal_type);
Int16Builder int_builder;

// Fill with 1024 different values
for (int64_t i = 0; i < 1024; i++) {
const uint8_t bytes[] = {0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
12,
12,
static_cast<uint8_t>(i / 128),
static_cast<uint8_t>(i % 128)};
ASSERT_OK(builder.Append(bytes));
ASSERT_OK(fsb_builder.Append(bytes));
ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
}
// Fill with an already existing value
const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
for (int64_t i = 0; i < 1024; i++) {
ASSERT_OK(builder.Append(known_value));
ASSERT_OK(int_builder.Append(1));
}

// Finalize result
std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));

// Finalize expected data
std::shared_ptr<Array> fsb_array;
ASSERT_OK(fsb_builder.Finish(&fsb_array));

auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
std::shared_ptr<Array> int_array;
ASSERT_OK(int_builder.Finish(&int_array));

DictionaryArray expected(dtype, int_array);
ASSERT_TRUE(expected.Equals(result));
}

// ----------------------------------------------------------------------
// List tests

Expand Down
11 changes: 6 additions & 5 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -831,11 +831,11 @@ DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
hash_table_(new PoolBuffer(pool)),
hash_slots_(nullptr),
dict_builder_(type, pool),
values_builder_(pool) {
values_builder_(pool),
byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width()) {
if (!::arrow::CpuInfo::initialized()) {
::arrow::CpuInfo::Init();
}
byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
}

#ifndef ARROW_NO_DEPRECATED_API
Expand Down Expand Up @@ -921,7 +921,7 @@ Status DictionaryBuilder<T>::Append(const Scalar& value) {

template <typename T>
Status DictionaryBuilder<T>::AppendArray(const Array& array) {
const NumericArray<T>& numeric_array = static_cast<const NumericArray<T>&>(array);
const auto& numeric_array = static_cast<const NumericArray<T>&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
RETURN_NOT_OK(AppendNull());
Expand All @@ -938,8 +938,7 @@ Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
}

const FixedSizeBinaryArray& numeric_array =
static_cast<const FixedSizeBinaryArray&>(array);
const auto& numeric_array = static_cast<const FixedSizeBinaryArray&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
RETURN_NOT_OK(AppendNull());
Expand Down Expand Up @@ -1493,6 +1492,7 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder);
DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder);
DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
DICTIONARY_BUILDER_CASE(DECIMAL, DictionaryBuilder<FixedSizeBinaryType>);
default:
return Status::NotImplemented(type->ToString());
}
Expand Down Expand Up @@ -1528,6 +1528,7 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool,
DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder);
DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder);
DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
DICTIONARY_ARRAY_CASE(DECIMAL, DictionaryBuilder<FixedSizeBinaryType>);
default:
std::stringstream ss;
ss << "Cannot encode array of type " << type->ToString();
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -937,8 +937,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {

class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
public:
using DictionaryBuilder::DictionaryBuilder;
using DictionaryBuilder::Append;
using DictionaryBuilder::DictionaryBuilder;

Status Append(const uint8_t* value, int32_t length) {
return Append(internal::WrappedBinary(value, length));
Expand All @@ -958,8 +958,8 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType
/// \brief Dictionary array builder with convenience methods for strings
class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
public:
using DictionaryBuilder::DictionaryBuilder;
using DictionaryBuilder::Append;
using DictionaryBuilder::DictionaryBuilder;

Status Append(const uint8_t* value, int32_t length) {
return Append(internal::WrappedBinary(value, length));
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/arrow/compute/cast.cc
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,10 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input
if (can_pre_allocate_values) {
std::shared_ptr<Buffer> out_data;

if (!(is_primitive(out->type->id()) || out->type->id() == Type::FIXED_SIZE_BINARY)) {
const Type::type type_id = out->type->id();

if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY ||
type_id == Type::DECIMAL)) {
std::stringstream ss;
ss << "Cannot pre-allocate memory for type: " << out->type->ToString();
return Status::NotImplemented(ss.str());
Expand Down Expand Up @@ -614,6 +617,7 @@ class CastKernel : public UnaryKernel {
FN(IN_TYPE, FloatType); \
FN(IN_TYPE, DoubleType); \
FN(IN_TYPE, FixedSizeBinaryType); \
FN(IN_TYPE, DecimalType); \
FN(IN_TYPE, BinaryType); \
FN(IN_TYPE, StringType);

Expand Down
8 changes: 4 additions & 4 deletions cpp/src/arrow/io/io-file-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -393,9 +393,9 @@ TEST_F(TestReadableFile, ThreadSafety) {
ASSERT_OK(ReadableFile::Open(path_, &pool, &file_));

std::atomic<int> correct_count(0);
const int niter = 10000;
constexpr int niter = 10000;

auto ReadData = [&correct_count, &data, niter, this]() {
auto ReadData = [&correct_count, &data, this]() {
std::shared_ptr<Buffer> buffer;

for (int i = 0; i < niter; ++i) {
Expand Down Expand Up @@ -587,9 +587,9 @@ TEST_F(TestMemoryMappedFile, ThreadSafety) {
static_cast<int64_t>(data.size())));

std::atomic<int> correct_count(0);
const int niter = 10000;
constexpr int niter = 10000;

auto ReadData = [&correct_count, &data, niter, &file]() {
auto ReadData = [&correct_count, &data, &file, niter]() {
std::shared_ptr<Buffer> buffer;

for (int i = 0; i < niter; ++i) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/io/io-hdfs-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ TYPED_TEST(TestHadoopFileSystem, ThreadSafety) {
ASSERT_OK(this->client_->OpenReadable(src_path, &file));

std::atomic<int> correct_count(0);
const int niter = 1000;
constexpr int niter = 1000;

auto ReadData = [&file, &correct_count, &data, niter]() {
for (int i = 0; i < niter; ++i) {
Expand Down

0 comments on commit d925a95

Please sign in to comment.