Skip to content

Commit

Permalink
ARROW-16757: [C++] Remove "scalar" output modality for ScalarKernel i…
Browse files Browse the repository at this point in the history
…mplementations, remove ValueDescr class (#13521)

This was a pretty painful refactoring project, but all for the better long term. 

I will do my best to improve the PR summary to explain changes as I go through the PR myself, but the basic summary is that argument shape is no longer part of kernel signatures (or expressions or anything else) -- the "all scalar" evaluation case is handled by promoting scalars to ArraySpan of length 1. This enabled the deletion of many "duplicate" kernel implementations and uncovered some bugs in some kernel implementations as a result of passing many sliced length-1 inputs up-promoted from scalars. 

This had painful knock-on effects because some scalar values -- sparse unions and some other non-primitive types -- were not "well-formed" enough to be able to be converted to ArraySpan without needing to allocate memory. As a result, I had to change the requirements (and in the case of sparse unions, the representation to correspond to a "single row view" of a SparseUnionArray -- i.e. the "value" for a sparse union is now a vector of scalars rather than a single scalar, even though only one of the values in the vector is the "active" scalar) of some scalar values to not allow null values. We should / could consider implementing scalars altogether internally as arrays of length 1 but we can discuss that as a follow up.

Some other interesting stuff in here is the new experimental `arrow::TypeHolder` struct which allows us to pass around either `const DataType*` or `std::shared_ptr<DataType>` in contexts where type ownership isn't strictly necessary. So now we can deal only with raw pointers when doing type signature checking / kernel dispatching, for example. We should look at refactoring other code where we deal with `shared_ptr<DataType>` (but don't necessarily need to preserve ownership) to use TypeHolder. 

There's some unfinished business here:

* The GLib bindings must be refactored to follow this work

Lead-authored-by: Wes McKinney <wesm@apache.org>
Co-authored-by: Sutou Kouhei <kou@clear-code.com>
Signed-off-by: Wes McKinney <wesm@apache.org>
  • Loading branch information
wesm and kou committed Jul 8, 2022
1 parent 922f58f commit 6cc37cf
Show file tree
Hide file tree
Showing 118 changed files with 3,164 additions and 4,205 deletions.
9 changes: 3 additions & 6 deletions c_glib/arrow-glib/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5284,12 +5284,9 @@ GArrowCastOptions *
garrow_cast_options_new_raw(const arrow::compute::CastOptions *arrow_options)
{
GArrowDataType *to_data_type = NULL;
if (arrow_options->to_type) {
auto arrow_copied_options = arrow_options->Copy();
auto arrow_copied_cast_options =
static_cast<arrow::compute::CastOptions *>(arrow_copied_options.get());
to_data_type =
garrow_data_type_new_raw(&(arrow_copied_cast_options->to_type));
if (arrow_options->to_type.type) {
auto arrow_to_data_type = arrow_options->to_type.GetSharedPtr();
to_data_type = garrow_data_type_new_raw(&arrow_to_data_type);
}
auto options =
g_object_new(GARROW_TYPE_CAST_OPTIONS,
Expand Down
44 changes: 38 additions & 6 deletions c_glib/arrow-glib/scalar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2401,9 +2401,31 @@ garrow_sparse_union_scalar_new(GArrowSparseUnionDataType *data_type,
gint8 type_code,
GArrowScalar *value)
{
return GARROW_SPARSE_UNION_SCALAR(
garrow_union_scalar_new<arrow::SparseUnionScalar>(
GARROW_DATA_TYPE(data_type), type_code, value));
auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
const auto &arrow_type_codes =
std::dynamic_pointer_cast<arrow::SparseUnionType>(
arrow_data_type)->type_codes();
auto arrow_value = garrow_scalar_get_raw(value);
arrow::SparseUnionScalar::ValueType arrow_field_values;
for (int i = 0; i < arrow_data_type->num_fields(); ++i) {
if (arrow_type_codes[i] == type_code) {
arrow_field_values.emplace_back(arrow_value);
} else {
arrow_field_values.emplace_back(
arrow::MakeNullScalar(arrow_data_type->field(i)->type()));
}
}
auto arrow_scalar =
std::static_pointer_cast<arrow::Scalar>(
std::make_shared<arrow::SparseUnionScalar>(arrow_field_values,
type_code,
arrow_data_type));
auto scalar = garrow_scalar_new_raw(&arrow_scalar,
"scalar", &arrow_scalar,
"data-type", data_type,
"value", value,
NULL);
return GARROW_SPARSE_UNION_SCALAR(scalar);
}


Expand Down Expand Up @@ -2436,9 +2458,19 @@ garrow_dense_union_scalar_new(GArrowDenseUnionDataType *data_type,
gint8 type_code,
GArrowScalar *value)
{
return GARROW_DENSE_UNION_SCALAR(
garrow_union_scalar_new<arrow::DenseUnionScalar>(
GARROW_DATA_TYPE(data_type), type_code, value));
auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
auto arrow_value = garrow_scalar_get_raw(value);
auto arrow_scalar =
std::static_pointer_cast<arrow::Scalar>(
std::make_shared<arrow::DenseUnionScalar>(arrow_value,
type_code,
arrow_data_type));
auto scalar = garrow_scalar_new_raw(&arrow_scalar,
"scalar", &arrow_scalar,
"data-type", data_type,
"value", value,
NULL);
return GARROW_DENSE_UNION_SCALAR(scalar);
}


Expand Down
6 changes: 5 additions & 1 deletion c_glib/test/test-large-binary-scalar.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ def test_equal
end

def test_to_s
assert_equal("...", @scalar.to_s)
assert_equal(<<-BINARY.strip, @scalar.to_s)
[
030102
]
BINARY
end

def test_value
Expand Down
6 changes: 5 additions & 1 deletion c_glib/test/test-large-string-scalar.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ def test_equal
end

def test_to_s
assert_equal("...", @scalar.to_s)
assert_equal(<<-STRING.strip, @scalar.to_s)
[
"Hello"
]
STRING
end

def test_value
Expand Down
12 changes: 11 additions & 1 deletion c_glib/test/test-list-scalar.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,17 @@ def test_equal
end

def test_to_s
assert_equal("...", @scalar.to_s)
assert_equal(<<-LIST.strip, @scalar.to_s)
[
[
[
1,
2,
3
]
]
]
LIST
end

def test_value
Expand Down
15 changes: 14 additions & 1 deletion c_glib/test/test-map-scalar.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,20 @@ def test_equal
end

def test_to_s
assert_equal("...", @scalar.to_s)
assert_equal(<<-MAP.strip, @scalar.to_s)
[
keys:
[
"hello",
"world"
]
values:
[
1,
2
]
]
MAP
end

def test_value
Expand Down
3 changes: 1 addition & 2 deletions cpp/examples/arrow/compute_register_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,7 @@ const cp::FunctionDoc func_doc{
int main(int argc, char** argv) {
const std::string name = "compute_register_example";
auto func = std::make_shared<cp::ScalarFunction>(name, cp::Arity::Unary(), func_doc);
cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(),
ExampleFunctionImpl);
cp::ScalarKernel kernel({arrow::int64()}, arrow::int64(), ExampleFunctionImpl);
kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE;
ABORT_ON_FAILURE(func->AddKernel(std::move(kernel)));

Expand Down
6 changes: 2 additions & 4 deletions cpp/examples/arrow/udf_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,8 @@ arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecSpan& batch,
arrow::Status Execute() {
const std::string name = "add_three";
auto func = std::make_shared<cp::ScalarFunction>(name, cp::Arity::Ternary(), func_doc);
cp::ScalarKernel kernel(
{cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64()),
cp::InputType::Array(arrow::int64())},
arrow::int64(), SampleFunction);
cp::ScalarKernel kernel({arrow::int64(), arrow::int64(), arrow::int64()},
arrow::int64(), SampleFunction);

kernel.mem_allocation = cp::MemAllocation::PREALLOCATE;
kernel.null_handling = cp::NullHandling::INTERSECTION;
Expand Down
40 changes: 32 additions & 8 deletions cpp/gdb_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1406,13 +1406,12 @@ class FixedSizeBinaryScalarPrinter(BaseBinaryScalarPrinter):

def to_string(self):
size = self.type['byte_width_']
if not self.is_valid:
return f"{self._format_type()} of size {size}, null value"
bufptr = BufferPtr(SharedPtr(self.val['value']).get())
if bufptr.data is None:
return f"{self._format_type()} of size {size}, <unallocated>"
nullness = '' if self.is_valid else 'null with '
return (f"{self._format_type()} of size {size}, "
f"value {self._format_buf(bufptr)}")
f"{nullness}value {self._format_buf(bufptr)}")


class DictionaryScalarPrinter(ScalarPrinter):
Expand Down Expand Up @@ -1450,6 +1449,8 @@ def display_hint(self):
return 'map'

def children(self):
if not self.is_valid:
return None
eval_fields = StdVector(self.type['children_'])
eval_values = StdVector(self.val['value'])
for field, value in zip(eval_fields, eval_values):
Expand All @@ -1463,7 +1464,24 @@ def to_string(self):
return f"{self._format_type()}"


class UnionScalarPrinter(ScalarPrinter):
class SparseUnionScalarPrinter(ScalarPrinter):
"""
Pretty-printer for arrow::UnionScalar and subclasses.
"""

def to_string(self):
type_code = self.val['type_code'].cast(gdb.lookup_type('int'))
if not self.is_valid:
return (f"{self._format_type()} of type {self.type}, "
f"type code {type_code}, null value")
eval_values = StdVector(self.val['value'])
child_id = self.val['child_id'].cast(gdb.lookup_type('int'))
return (f"{self._format_type()} of type code {type_code}, "
f"value {deref(eval_values[child_id])}")



class DenseUnionScalarPrinter(ScalarPrinter):
"""
Pretty-printer for arrow::UnionScalar and subclasses.
"""
Expand Down Expand Up @@ -1968,10 +1986,16 @@ class StructTypeClass(DataTypeClass):
scalar_printer = StructScalarPrinter


class UnionTypeClass(DataTypeClass):
class DenseUnionTypeClass(DataTypeClass):
is_parametric = True
type_printer = UnionTypePrinter
scalar_printer = DenseUnionScalarPrinter


class SparseUnionTypeClass(DataTypeClass):
is_parametric = True
type_printer = UnionTypePrinter
scalar_printer = UnionScalarPrinter
scalar_printer = SparseUnionScalarPrinter


class DictionaryTypeClass(DataTypeClass):
Expand Down Expand Up @@ -2037,8 +2061,8 @@ class ExtensionTypeClass(DataTypeClass):
Type.MAP: DataTypeTraits(MapTypeClass, 'MapType'),

Type.STRUCT: DataTypeTraits(StructTypeClass, 'StructType'),
Type.SPARSE_UNION: DataTypeTraits(UnionTypeClass, 'SparseUnionType'),
Type.DENSE_UNION: DataTypeTraits(UnionTypeClass, 'DenseUnionType'),
Type.SPARSE_UNION: DataTypeTraits(SparseUnionTypeClass, 'SparseUnionType'),
Type.DENSE_UNION: DataTypeTraits(DenseUnionTypeClass, 'DenseUnionType'),

Type.DICTIONARY: DataTypeTraits(DictionaryTypeClass, 'DictionaryType'),
Type.EXTENSION: DataTypeTraits(ExtensionTypeClass, 'ExtensionType'),
Expand Down
23 changes: 9 additions & 14 deletions cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,15 @@ struct ScalarFromArraySlotImpl {
}

Status Visit(const SparseUnionArray& a) {
const auto type_code = a.type_code(index_);
// child array which stores the actual value
const auto arr = a.field(a.child_id(index_));
// no need to adjust the index
ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(index_));
if (value->is_valid) {
out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(value, type_code, a.type()));
} else {
out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(type_code, a.type()));
int8_t type_code = a.type_code(index_);

ScalarVector children;
for (int i = 0; i < a.type()->num_fields(); ++i) {
children.emplace_back();
ARROW_ASSIGN_OR_RAISE(children.back(), a.field(i)->GetScalar(index_));
}

out_ = std::make_shared<SparseUnionScalar>(std::move(children), type_code, a.type());
return Status::OK();
}

Expand All @@ -124,11 +123,7 @@ struct ScalarFromArraySlotImpl {
// need to look up the value based on offsets
auto offset = a.value_offset(index_);
ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(offset));
if (value->is_valid) {
out_ = std::shared_ptr<Scalar>(new DenseUnionScalar(value, type_code, a.type()));
} else {
out_ = std::shared_ptr<Scalar>(new DenseUnionScalar(type_code, a.type()));
}
out_ = std::make_shared<DenseUnionScalar>(value, type_code, a.type());
return Status::OK();
}

Expand Down
12 changes: 6 additions & 6 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -561,16 +561,16 @@ static ScalarVector GetScalars() {
},
struct_({field("min", int32()), field("max", int32())})),
// Same values, different union type codes
std::make_shared<SparseUnionScalar>(std::make_shared<Int32Scalar>(100), 6,
sparse_union_ty),
std::make_shared<SparseUnionScalar>(std::make_shared<Int32Scalar>(100), 42,
sparse_union_ty),
std::make_shared<SparseUnionScalar>(42, sparse_union_ty),
SparseUnionScalar::FromValue(std::make_shared<Int32Scalar>(100), 1,
sparse_union_ty),
SparseUnionScalar::FromValue(std::make_shared<Int32Scalar>(100), 2,
sparse_union_ty),
SparseUnionScalar::FromValue(MakeNullScalar(int32()), 2, sparse_union_ty),
std::make_shared<DenseUnionScalar>(std::make_shared<Int32Scalar>(101), 6,
dense_union_ty),
std::make_shared<DenseUnionScalar>(std::make_shared<Int32Scalar>(101), 42,
dense_union_ty),
std::make_shared<DenseUnionScalar>(42, dense_union_ty),
std::make_shared<DenseUnionScalar>(MakeNullScalar(int32()), 42, dense_union_ty),
DictionaryScalar::Make(ScalarFromJSON(int8(), "1"),
ArrayFromJSON(utf8(), R"(["foo", "bar"])")),
DictionaryScalar::Make(ScalarFromJSON(uint8(), "1"),
Expand Down
Loading

0 comments on commit 6cc37cf

Please sign in to comment.