Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-5155: [GLib][Ruby] Add support for building union arrays from data type #4127

Closed
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
99 changes: 99 additions & 0 deletions c_glib/arrow-glib/composite-array.cpp
Expand Up @@ -366,6 +366,54 @@ garrow_sparse_union_array_new(GArrowInt8Array *type_ids,
}
}

/**
 * garrow_sparse_union_array_new_data_type:
 * @data_type: The data type for the sparse union array. Its field names
 *   and type codes are used for the created array.
 * @type_ids: The field type IDs for each value as #GArrowInt8Array.
 * @fields: (element-type GArrowArray): The arrays for each field
 *   as #GList of #GArrowArray.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable): A newly created #GArrowSparseUnionArray
 *   or %NULL on error.
 *
 * Since: 0.14.0
 */
GArrowSparseUnionArray *
garrow_sparse_union_array_new_data_type(GArrowSparseUnionDataType *data_type,
                                        GArrowInt8Array *type_ids,
                                        GList *fields,
                                        GError **error)
{
  auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
  auto arrow_union_data_type =
    std::static_pointer_cast<arrow::UnionType>(arrow_data_type);

  // Field names are taken from the data type, one per child field.
  const auto &arrow_children = arrow_union_data_type->children();
  std::vector<std::string> arrow_field_names;
  arrow_field_names.reserve(arrow_children.size());
  for (const auto &arrow_field : arrow_children) {
    arrow_field_names.push_back(arrow_field->name());
  }

  auto arrow_type_ids = garrow_array_get_raw(GARROW_ARRAY(type_ids));

  // Unwrap each GArrowArray in the GList into its underlying arrow::Array.
  std::vector<std::shared_ptr<arrow::Array>> arrow_fields;
  for (auto node = fields; node; node = node->next) {
    auto *field = GARROW_ARRAY(node->data);
    arrow_fields.push_back(garrow_array_get_raw(field));
  }

  // The type codes are only read by MakeSparse(), so the data type's own
  // vector is passed directly instead of copying it first.
  std::shared_ptr<arrow::Array> arrow_union_array;
  auto status =
    arrow::UnionArray::MakeSparse(*arrow_type_ids,
                                  arrow_fields,
                                  arrow_field_names,
                                  arrow_union_data_type->type_codes(),
                                  &arrow_union_array);
  if (garrow_error_check(error,
                         status,
                         "[sparse-union-array][new][data-type]")) {
    return GARROW_SPARSE_UNION_ARRAY(garrow_array_new_raw(&arrow_union_array));
  } else {
    return NULL;
  }
}


G_DEFINE_TYPE(GArrowDenseUnionArray,
garrow_dense_union_array,
Expand Down Expand Up @@ -420,6 +468,57 @@ garrow_dense_union_array_new(GArrowInt8Array *type_ids,
}
}

/**
 * garrow_dense_union_array_new_data_type:
 * @data_type: The data type for the dense union array. Its field names
 *   and type codes are used for the created array.
 * @type_ids: The field type IDs for each value as #GArrowInt8Array.
 * @value_offsets: The value offsets for each value as #GArrowInt32Array.
 *   Each offset is counted for each type.
 * @fields: (element-type GArrowArray): The arrays for each field
 *   as #GList of #GArrowArray.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable): A newly created #GArrowDenseUnionArray
 *   or %NULL on error.
 *
 * Since: 0.14.0
 */
GArrowDenseUnionArray *
garrow_dense_union_array_new_data_type(GArrowDenseUnionDataType *data_type,
                                       GArrowInt8Array *type_ids,
                                       GArrowInt32Array *value_offsets,
                                       GList *fields,
                                       GError **error)
{
  auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
  auto arrow_union_data_type =
    std::static_pointer_cast<arrow::UnionType>(arrow_data_type);

  // Field names are taken from the data type, one per child field.
  const auto &arrow_children = arrow_union_data_type->children();
  std::vector<std::string> arrow_field_names;
  arrow_field_names.reserve(arrow_children.size());
  for (const auto &arrow_field : arrow_children) {
    arrow_field_names.push_back(arrow_field->name());
  }

  auto arrow_type_ids = garrow_array_get_raw(GARROW_ARRAY(type_ids));
  auto arrow_value_offsets = garrow_array_get_raw(GARROW_ARRAY(value_offsets));

  // Unwrap each GArrowArray in the GList into its underlying arrow::Array.
  std::vector<std::shared_ptr<arrow::Array>> arrow_fields;
  for (auto node = fields; node; node = node->next) {
    auto *field = GARROW_ARRAY(node->data);
    arrow_fields.push_back(garrow_array_get_raw(field));
  }

  // The type codes are only read by MakeDense(), so the data type's own
  // vector is passed directly instead of copying it first.
  std::shared_ptr<arrow::Array> arrow_union_array;
  auto status =
    arrow::UnionArray::MakeDense(*arrow_type_ids,
                                 *arrow_value_offsets,
                                 arrow_fields,
                                 arrow_field_names,
                                 arrow_union_data_type->type_codes(),
                                 &arrow_union_array);
  if (garrow_error_check(error, status, "[dense-union-array][new][data-type]")) {
    return GARROW_DENSE_UNION_ARRAY(garrow_array_new_raw(&arrow_union_array));
  } else {
    return NULL;
  }
}


G_DEFINE_TYPE(GArrowDictionaryArray,
garrow_dictionary_array,
Expand Down
11 changes: 11 additions & 0 deletions c_glib/arrow-glib/composite-array.h
Expand Up @@ -108,6 +108,11 @@ GArrowSparseUnionArray *
garrow_sparse_union_array_new(GArrowInt8Array *type_ids,
GList *fields,
GError **error);
/* Creates a sparse union array from an explicit union data type, the
 * per-value type IDs and a GList of per-field GArrowArray.
 * Returns NULL on error; the API documentation lives with the definition
 * in composite-array.cpp. */
GArrowSparseUnionArray *
garrow_sparse_union_array_new_data_type(GArrowSparseUnionDataType *data_type,
GArrowInt8Array *type_ids,
GList *fields,
GError **error);


#define GARROW_TYPE_DENSE_UNION_ARRAY (garrow_dense_union_array_get_type())
Expand All @@ -126,6 +131,12 @@ garrow_dense_union_array_new(GArrowInt8Array *type_ids,
GArrowInt32Array *value_offsets,
GList *fields,
GError **error);
/* Creates a dense union array from an explicit union data type, the
 * per-value type IDs, the per-value offsets and a GList of per-field
 * GArrowArray.
 * Returns NULL on error; the API documentation lives with the definition
 * in composite-array.cpp. */
GArrowDenseUnionArray *
garrow_dense_union_array_new_data_type(GArrowDenseUnionDataType *data_type,
GArrowInt8Array *type_ids,
GArrowInt32Array *value_offsets,
GList *fields,
GError **error);


#define GARROW_TYPE_DICTIONARY_ARRAY (garrow_dictionary_array_get_type())
Expand Down
86 changes: 61 additions & 25 deletions c_glib/test/test-dense-union-array.rb
Expand Up @@ -18,33 +18,69 @@
class TestDenseUnionArray < Test::Unit::TestCase
include Helper::Buildable

def setup
type_ids = build_int8_array([0, 1, nil, 1, 1])
value_offsets = build_int32_array([0, 0, 0, 1, 2])
fields = [
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
]
@array = Arrow::DenseUnionArray.new(type_ids, value_offsets, fields)
end
sub_test_case(".new") do
def setup
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you put more sub test cases under ".new" sub test case?

sub_test_case(".new") do
  sub_test_case("default") do # or "no DataType"?
  end

  sub_test_case("DataType") do
  end
end

type_ids = build_int8_array([0, 1, nil, 1, 1])
value_offsets = build_int32_array([0, 0, 0, 1, 2])
fields = [
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
]
@array = Arrow::DenseUnionArray.new(type_ids, value_offsets, fields)
end

def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::DenseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
end

def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::DenseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
def test_field
assert_equal([
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
],
[
@array.get_field(0),
@array.get_field(1),
])
end
end

def test_field
assert_equal([
build_int16_array([1]),
build_string_array(["a", "b", "c"]),
],
[
@array.get_field(0),
@array.get_field(1),
])
# Builds a dense union array from an explicit DenseUnionDataType whose
# field names and type codes differ from the defaults ("0"/"1", 0/1).
sub_test_case("DataType") do
  def setup
    data_type_fields = [
      Arrow::Field.new("number", Arrow::Int16DataType.new),
      Arrow::Field.new("text", Arrow::StringDataType.new),
    ]
    type_codes = [11, 13]
    @data_type = Arrow::DenseUnionDataType.new(data_type_fields, type_codes)
    # Type IDs must be the data type's type codes (11 = "number",
    # 13 = "text"), not field indexes. The last value uses offset 2,
    # which only exists in the three-element "text" field, so it is 13.
    type_ids = build_int8_array([11, 13, nil, 13, 13])
    value_offsets = build_int32_array([0, 0, 0, 1, 2])
    fields = [
      build_int16_array([1]),
      build_string_array(["a", "b", "c"]),
    ]
    @array = Arrow::DenseUnionArray.new(@data_type,
                                        type_ids,
                                        value_offsets,
                                        fields)
  end

  def test_value_data_type
    assert_equal(@data_type,
                 @array.value_data_type)
  end

  def test_field
    assert_equal([
                   build_int16_array([1]),
                   build_string_array(["a", "b", "c"]),
                 ],
                 [
                   @array.get_field(0),
                   @array.get_field(1),
                 ])
  end
end
end
83 changes: 59 additions & 24 deletions c_glib/test/test-sparse-union-array.rb
Expand Up @@ -18,32 +18,67 @@
class TestSparseUnionArray < Test::Unit::TestCase
include Helper::Buildable

def setup
type_ids = build_int8_array([0, 1, nil, 1, 0])
fields = [
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
]
@array = Arrow::SparseUnionArray.new(type_ids, fields)
end
sub_test_case(".new") do
def setup
type_ids = build_int8_array([0, 1, nil, 1, 0])
fields = [
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
]
@array = Arrow::SparseUnionArray.new(type_ids, fields)
end

def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::SparseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
end

def test_value_data_type
fields = [
Arrow::Field.new("0", Arrow::Int16DataType.new),
Arrow::Field.new("1", Arrow::StringDataType.new),
]
assert_equal(Arrow::SparseUnionDataType.new(fields, [0, 1]),
@array.value_data_type)
def test_field
assert_equal([
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
],
[
@array.get_field(0),
@array.get_field(1),
])
end
end

def test_field
assert_equal([
build_int16_array([1, nil, nil, nil, 5]),
build_string_array([nil, "b", nil, "d", nil]),
],
[
@array.get_field(0),
@array.get_field(1),
])
# Builds a sparse union array from an explicit SparseUnionDataType whose
# field names and type codes differ from the defaults ("0"/"1", 0/1).
sub_test_case("DataType") do
  def setup
    data_type_fields = [
      Arrow::Field.new("number", Arrow::Int16DataType.new),
      Arrow::Field.new("text", Arrow::StringDataType.new),
    ]
    type_codes = [11, 13]
    @data_type = Arrow::SparseUnionDataType.new(data_type_fields, type_codes)
    # Type IDs must be the data type's type codes (11 = "number",
    # 13 = "text"), not field indexes. They follow which field holds
    # the non-null value at each position.
    type_ids = build_int8_array([11, 13, nil, 13, 11])
    fields = [
      build_int16_array([1, nil, nil, nil, 5]),
      build_string_array([nil, "b", nil, "d", nil]),
    ]
    @array = Arrow::SparseUnionArray.new(@data_type, type_ids, fields)
  end

  def test_value_data_type
    assert_equal(@data_type,
                 @array.value_data_type)
  end

  def test_field
    assert_equal([
                   build_int16_array([1, nil, nil, nil, 5]),
                   build_string_array([nil, "b", nil, "d", nil]),
                 ],
                 [
                   @array.get_field(0),
                   @array.get_field(1),
                 ])
  end
end
end
Expand Up @@ -69,12 +69,8 @@ def build_record_batch(type, records)
offsets << (type_ids.count(type_id) - 1)
end
end
# TODO
# union_array = Arrow::DenseUnionArray.new(schema.fields[0].data_type,
# Arrow::Int8Array.new(type_ids),
# Arrow::Int32Array.new(offsets),
# arrays)
union_array = Arrow::DenseUnionArray.new(Arrow::Int8Array.new(type_ids),
union_array = Arrow::DenseUnionArray.new(schema.fields[0].data_type,
Arrow::Int8Array.new(type_ids),
Arrow::Int32Array.new(offsets),
arrays)
schema = Arrow::Schema.new(column: union_array.value_data_type)
Expand Down
Expand Up @@ -59,11 +59,8 @@ def build_record_batch(type, records)
type_ids << type_codes[1]
end
end
# TODO
# union_array = Arrow::SparseUnionArray.new(schema.fields[0].data_type,
# Arrow::Int8Array.new(type_ids),
# arrays)
union_array = Arrow::SparseUnionArray.new(Arrow::Int8Array.new(type_ids),
union_array = Arrow::SparseUnionArray.new(schema.fields[0].data_type,
Arrow::Int8Array.new(type_ids),
arrays)
schema = Arrow::Schema.new(column: union_array.value_data_type)
Arrow::RecordBatch.new(schema,
Expand Down