Skip to content

Commit

Permalink
ARROW-261: Refactor String/Binary code paths to reflect unnested (non…
Browse files Browse the repository at this point in the history
…-list-based) structure

Per discussions on the mailing list. This should in theory match the Java implementation.

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #176 from wesm/ARROW-261 and squashes the following commits:

dca39ce [Wes McKinney] Make binary/string constants static to avoid memory-access-related segfaults in third party libraries
1e65b01 [Wes McKinney] Deprecate pyarrow::Status in favor of just arrow::Status. Conform pyarrow use of ArrayBuilder::Finish
9a1f77e [Wes McKinney] Add license header to index.rst
bd70cab [Wes McKinney] Complete refactoring, fix up IPC tests for flattened string/binary buffer/metadata layout
ae64f2e [Wes McKinney] Refactoring to reflect collaprsed list-like structure of Binary and String types. Not yet complete
  • Loading branch information
wesm committed Oct 17, 2016
1 parent 8e8b17f commit 732a205
Show file tree
Hide file tree
Showing 43 changed files with 484 additions and 768 deletions.
1 change: 0 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,6 @@ set(ARROW_SRCS

src/arrow/types/construct.cc
src/arrow/types/decimal.cc
src/arrow/types/json.cc
src/arrow/types/list.cc
src/arrow/types/primitive.cc
src/arrow/types/string.cc
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class ARROW_EXPORT ArrayBuilder {

// Creates new array object to hold the contents of the builder and transfers
// ownership of the data. This resets all variables on the builder.
virtual std::shared_ptr<Array> Finish() = 0;
virtual Status Finish(std::shared_ptr<Array>* out) = 0;

const std::shared_ptr<DataType>& type() const { return type_; }

Expand Down
47 changes: 23 additions & 24 deletions cpp/src/arrow/ipc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,22 +78,6 @@ static bool IsPrimitive(const DataType* type) {
}
}

static bool IsListType(const DataType* type) {
DCHECK(type != nullptr);
switch (type->type) {
// TODO(emkornfield) grouping like this are used in a few places in the
// code consider using pattern like:
// http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type
//
case Type::BINARY:
case Type::LIST:
case Type::STRING:
return true;
default:
return false;
}
}

// ----------------------------------------------------------------------
// Record batch write path

Expand All @@ -115,7 +99,11 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
if (IsPrimitive(arr_type)) {
const auto prim_arr = static_cast<const PrimitiveArray*>(arr);
buffers->push_back(prim_arr->data());
} else if (IsListType(arr_type)) {
} else if (arr->type_enum() == Type::STRING || arr->type_enum() == Type::BINARY) {
const auto binary_arr = static_cast<const BinaryArray*>(arr);
buffers->push_back(binary_arr->offsets());
buffers->push_back(binary_arr->data());
} else if (arr->type_enum() == Type::LIST) {
const auto list_arr = static_cast<const ListArray*>(arr);
buffers->push_back(list_arr->offset_buffer());
RETURN_NOT_OK(VisitArray(
Expand Down Expand Up @@ -331,9 +319,21 @@ class RecordBatchReader::RecordBatchReaderImpl {
}
return MakePrimitiveArray(
type, field_meta.length, data, field_meta.null_count, null_bitmap, out);
}
} else if (type->type == Type::STRING || type->type == Type::BINARY) {
std::shared_ptr<Buffer> offsets;
std::shared_ptr<Buffer> values;
RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets));
RETURN_NOT_OK(GetBuffer(buffer_index_++, &values));

if (IsListType(type.get())) {
if (type->type == Type::STRING) {
*out = std::make_shared<StringArray>(
field_meta.length, offsets, values, field_meta.null_count, null_bitmap);
} else {
*out = std::make_shared<BinaryArray>(
field_meta.length, offsets, values, field_meta.null_count, null_bitmap);
}
return Status::OK();
} else if (type->type == Type::LIST) {
std::shared_ptr<Buffer> offsets;
RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets));
const int num_children = type->num_children();
Expand All @@ -346,11 +346,10 @@ class RecordBatchReader::RecordBatchReaderImpl {
std::shared_ptr<Array> values_array;
RETURN_NOT_OK(
NextArray(type->child(0).get(), max_recursion_depth - 1, &values_array));
return MakeListArray(type, field_meta.length, offsets, values_array,
field_meta.null_count, null_bitmap, out);
}

if (type->type == Type::STRUCT) {
*out = std::make_shared<ListArray>(type, field_meta.length, offsets, values_array,
field_meta.null_count, null_bitmap);
return Status::OK();
} else if (type->type == Type::STRUCT) {
const int num_children = type->num_children();
std::vector<ArrayPtr> fields;
fields.reserve(num_children);
Expand Down
19 changes: 8 additions & 11 deletions cpp/src/arrow/ipc/test-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ const auto kListInt32 = std::make_shared<ListType>(kInt32);
const auto kListListInt32 = std::make_shared<ListType>(kListInt32);

Status MakeRandomInt32Array(
int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) {
int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
std::shared_ptr<PoolBuffer> data;
test::MakeRandomInt32PoolBuffer(length, pool, &data);
const auto kInt32 = std::make_shared<Int32Type>();
Expand All @@ -52,16 +52,14 @@ Status MakeRandomInt32Array(
test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes);
RETURN_NOT_OK(builder.Append(
reinterpret_cast<const int32_t*>(data->data()), length, valid_bytes->data()));
*array = builder.Finish();
return Status::OK();
return builder.Finish(out);
}
RETURN_NOT_OK(builder.Append(reinterpret_cast<const int32_t*>(data->data()), length));
*array = builder.Finish();
return Status::OK();
return builder.Finish(out);
}

Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists,
bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) {
bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
// Create the null list values
std::vector<uint8_t> valid_lists(num_lists);
const double null_percent = include_nulls ? 0.1 : 0;
Expand Down Expand Up @@ -90,8 +88,8 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
}
ListBuilder builder(pool, child_array);
RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data()));
*array = builder.Finish();
return (*array)->Validate();
RETURN_NOT_OK(builder.Finish(out));
return (*out)->Validate();
}

typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out);
Expand All @@ -115,7 +113,7 @@ Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out) {

template <class Builder, class RawType>
Status MakeRandomBinaryArray(
const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) {
const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* out) {
const std::vector<std::string> values = {
"", "", "abc", "123", "efg", "456!@#!@#", "12312"};
Builder builder(pool, type);
Expand All @@ -130,8 +128,7 @@ Status MakeRandomBinaryArray(
builder.Append(reinterpret_cast<const RawType*>(value.data()), value.size()));
}
}
*array = builder.Finish();
return Status::OK();
return builder.Finish(out);
}

Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) {
Expand Down
20 changes: 4 additions & 16 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ struct ARROW_EXPORT DoubleType : public PrimitiveType<DoubleType> {
struct ARROW_EXPORT ListType : public DataType {
// List can contain any other logical value type
explicit ListType(const std::shared_ptr<DataType>& value_type)
: ListType(value_type, Type::LIST) {}
: ListType(std::make_shared<Field>("item", value_type)) {}

explicit ListType(const std::shared_ptr<Field>& value_field) : DataType(Type::LIST) {
children_ = {value_field};
Expand All @@ -255,26 +255,17 @@ struct ARROW_EXPORT ListType : public DataType {
static char const* name() { return "list"; }

std::string ToString() const override;

protected:
// Constructor for classes that are implemented as List Arrays.
ListType(const std::shared_ptr<DataType>& value_type, Type::type logical_type)
: DataType(logical_type) {
// TODO ARROW-187 this can technically fail, make a constructor method ?
children_ = {std::make_shared<Field>("item", value_type)};
}
};

// BinaryType type is reprsents lists of 1-byte values.
struct ARROW_EXPORT BinaryType : public ListType {
struct ARROW_EXPORT BinaryType : public DataType {
BinaryType() : BinaryType(Type::BINARY) {}
static char const* name() { return "binary"; }
std::string ToString() const override;

protected:
// Allow subclasses to change the logical type.
explicit BinaryType(Type::type logical_type)
: ListType(std::shared_ptr<DataType>(new UInt8Type()), logical_type) {}
explicit BinaryType(Type::type logical_type) : DataType(logical_type) {}
};

// UTF encoded strings
Expand All @@ -284,9 +275,6 @@ struct ARROW_EXPORT StringType : public BinaryType {
static char const* name() { return "string"; }

std::string ToString() const override;

protected:
explicit StringType(Type::type logical_type) : BinaryType(logical_type) {}
};

struct ARROW_EXPORT StructType : public DataType {
Expand All @@ -300,7 +288,7 @@ struct ARROW_EXPORT StructType : public DataType {

// These will be defined elsewhere
template <typename T>
struct type_traits {};
struct TypeTraits {};

} // namespace arrow

Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/types/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ install(FILES
construct.h
datetime.h
decimal.h
json.h
list.h
primitive.h
string.h
Expand Down
28 changes: 0 additions & 28 deletions cpp/src/arrow/types/binary.h

This file was deleted.

31 changes: 3 additions & 28 deletions cpp/src/arrow/types/construct.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(DOUBLE, DoubleBuilder);

BUILDER_CASE(STRING, StringBuilder);
BUILDER_CASE(BINARY, BinaryBuilder);

case Type::LIST: {
std::shared_ptr<ArrayBuilder> value_builder;
Expand Down Expand Up @@ -105,10 +106,10 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array);
MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array);
MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array);
MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray);
MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray);
MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray);
default:
return Status::NotImplemented(type->ToString());
Expand All @@ -120,30 +121,4 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
#endif
}

Status MakeListArray(const TypePtr& type, int32_t length,
const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t null_count,
const std::shared_ptr<Buffer>& null_bitmap, ArrayPtr* out) {
switch (type->type) {
case Type::BINARY:
out->reset(new BinaryArray(type, length, offsets, values, null_count, null_bitmap));
break;

case Type::LIST:
out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap));
break;

case Type::DECIMAL_TEXT:
case Type::STRING:
out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap));
break;
default:
return Status::NotImplemented(type->ToString());
}
#ifdef NDEBUG
return Status::OK();
#else
return (*out)->Validate();
#endif
}

} // namespace arrow
8 changes: 0 additions & 8 deletions cpp/src/arrow/types/construct.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,6 @@ Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type,
int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count,
const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out);

// Create new list arrays for logical types that are backed by ListArrays (e.g. list of
// primitives and strings)
// TODO(emkornfield) split up string vs list?
Status ARROW_EXPORT MakeListArray(const std::shared_ptr<DataType>& type, int32_t length,
const std::shared_ptr<Buffer>& offests, const std::shared_ptr<Array>& values,
int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap,
std::shared_ptr<Array>* out);

} // namespace arrow

#endif // ARROW_BUILDER_H_
37 changes: 0 additions & 37 deletions cpp/src/arrow/types/json.cc

This file was deleted.

36 changes: 0 additions & 36 deletions cpp/src/arrow/types/json.h

This file was deleted.

0 comments on commit 732a205

Please sign in to comment.