Skip to content

Commit

Permalink
Merge branch 'master' into r-group-convert
Browse files Browse the repository at this point in the history
  • Loading branch information
eitsupi committed Oct 13, 2022
2 parents 982f535 + 02c671a commit 225975e
Show file tree
Hide file tree
Showing 75 changed files with 2,806 additions and 480 deletions.
2 changes: 1 addition & 1 deletion ci/scripts/r_docker_configure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ if [[ -n "$DEVTOOLSET_VERSION" ]]; then
# Only add make var if not set
if ! grep -Fq "CXX17=" ~/.R/Makevars &> /dev/null; then
mkdir -p ~/.R
echo "CXX17=g++ -std=g++17 -g -O2 -fpic" >> ~/.R/Makevars
echo "CXX17=g++ -std=gnu++17 -g -O2 -fpic" >> ~/.R/Makevars
fi
fi

Expand Down
19 changes: 18 additions & 1 deletion cpp/src/arrow/compute/cast.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,25 @@ class CastMetaFunction : public MetaFunction {
// args[0].type() could be a nullptr so check for that before
// we do anything with it.
if (args[0].type() && args[0].type()->Equals(*cast_options->to_type)) {
return args[0];
// Nested types might differ in field names but still be considered equal,
// so we can only return non-nested types as-is.
if (!is_nested(args[0].type()->id())) {
return args[0];
} else if (args[0].is_array()) {
// TODO(ARROW-14999): if types are equal except for field names of list
// types, we can also use this code path.
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array,
::arrow::internal::GetArrayView(
args[0].array(), cast_options->to_type.owned_type));
return Datum(array);
} else if (args[0].is_chunked_array()) {
ARROW_ASSIGN_OR_RAISE(
std::shared_ptr<ChunkedArray> array,
args[0].chunked_array()->View(cast_options->to_type.owned_type));
return Datum(array);
}
}

Result<std::shared_ptr<CastFunction>> result =
GetCastFunction(*cast_options->to_type);
if (!result.ok()) {
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/arrow/compute/exec/expression.cc
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,11 @@ std::string Expression::ToString() const {

if (call->options) {
out += call->options->ToString();
out.resize(out.size() + 1);
} else {
out.resize(out.size() - 1);
} else if (call->arguments.size()) {
out.resize(out.size() - 2);
}
out.back() = ')';

out += ')';
return out;
}

Expand Down
8 changes: 6 additions & 2 deletions cpp/src/arrow/compute/exec/expression_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,7 @@ TEST(Expression, ToString) {
"allow_time_overflow=false, allow_decimal_truncate=false, "
"allow_float_truncate=false, allow_invalid_utf8=false})");

// NB: corrupted for nullary functions but we don't have any of those
EXPECT_EQ(call("widgetify", {}).ToString(), "widgetif)");
EXPECT_EQ(call("widgetify", {}).ToString(), "widgetify()");
EXPECT_EQ(
call("widgetify", {literal(1)}, std::make_shared<WidgetifyOptions>()).ToString(),
"widgetify(1, widgetify)");
Expand All @@ -312,6 +311,11 @@ TEST(Expression, ToString) {
})
.ToString(),
"{a=a, renamed_a=a, three=3, b=" + in_12.ToString() + "}");

EXPECT_EQ(call("round", {literal(3.14)}, compute::RoundOptions()).ToString(),
"round(3.14, {ndigits=0, round_mode=HALF_TO_EVEN})");
EXPECT_EQ(call("random", {}, compute::RandomOptions()).ToString(),
"random({initializer=SystemRandom, seed=0})");
}

TEST(Expression, Equality) {
Expand Down
147 changes: 113 additions & 34 deletions cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,25 +40,22 @@ namespace {

// (Large)List<T> -> (Large)List<U>

template <typename SrcType, typename DestType>
typename std::enable_if<SrcType::type_id == DestType::type_id, Status>::type
CastListOffsets(KernelContext* ctx, const ArraySpan& in_array, ArrayData* out_array) {
return Status::OK();
}

// TODO(wesm): memory could be preallocated here and it would make
// things simpler
template <typename SrcType, typename DestType>
typename std::enable_if<SrcType::type_id != DestType::type_id, Status>::type
CastListOffsets(KernelContext* ctx, const ArraySpan& in_array, ArrayData* out_array) {
Status CastListOffsets(KernelContext* ctx, const ArraySpan& in_array,
ArrayData* out_array) {
using src_offset_type = typename SrcType::offset_type;
using dest_offset_type = typename DestType::offset_type;

ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
ctx->Allocate(sizeof(dest_offset_type) * (in_array.length + 1)));
::arrow::internal::CastInts(in_array.GetValues<src_offset_type>(1),
out_array->GetMutableValues<dest_offset_type>(1),
in_array.length + 1);
if constexpr (!std::is_same<src_offset_type, dest_offset_type>::value) {
ARROW_ASSIGN_OR_RAISE(out_array->buffers[1], ctx->Allocate(sizeof(dest_offset_type) *
(in_array.length + 1)));
::arrow::internal::CastInts(in_array.GetValues<src_offset_type>(1),
out_array->GetMutableValues<dest_offset_type>(1),
in_array.length + 1);
}

return Status::OK();
}

Expand All @@ -70,25 +67,10 @@ struct CastList {
static constexpr bool is_upcast = sizeof(src_offset_type) < sizeof(dest_offset_type);
static constexpr bool is_downcast = sizeof(src_offset_type) > sizeof(dest_offset_type);

static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const CastOptions& options = CastState::Get(ctx);

auto child_type = checked_cast<const DestType&>(*out->type()).value_type();

const ArraySpan& in_array = batch[0].array;
static Status HandleOffsets(KernelContext* ctx, const ArraySpan& in_array,
ArrayData* out_array, std::shared_ptr<ArrayData>* values) {
auto offsets = in_array.GetValues<src_offset_type>(1);

ArrayData* out_array = out->array_data().get();
out_array->buffers[0] = in_array.GetBuffer(0);
out_array->buffers[1] = in_array.GetBuffer(1);

// Shift bitmap in case the source offset is non-zero
if (in_array.offset != 0 && in_array.buffers[0].data != nullptr) {
ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
CopyBitmap(ctx->memory_pool(), in_array.buffers[0].data,
in_array.offset, in_array.length));
}

// Handle list offsets
// Several cases can arise:
// - the source offset is non-zero, in which case we slice the underlying values
Expand All @@ -103,8 +85,6 @@ struct CastList {
}
}

std::shared_ptr<ArrayData> values = in_array.child_data[0].ToArrayData();

if (in_array.offset != 0) {
ARROW_ASSIGN_OR_RAISE(
out_array->buffers[1],
Expand All @@ -115,11 +95,36 @@ struct CastList {
shifted_offsets[i] = static_cast<dest_offset_type>(offsets[i] - offsets[0]);
}

values = values->Slice(offsets[0], offsets[in_array.length]);
*values = (*values)->Slice(offsets[0], offsets[in_array.length]);
} else {
RETURN_NOT_OK((CastListOffsets<SrcType, DestType>(ctx, in_array, out_array)));
}

return Status::OK();
}

static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const CastOptions& options = CastState::Get(ctx);

auto child_type = checked_cast<const DestType&>(*out->type()).value_type();

const ArraySpan& in_array = batch[0].array;

ArrayData* out_array = out->array_data().get();
out_array->buffers[0] = in_array.GetBuffer(0);
out_array->buffers[1] = in_array.GetBuffer(1);

std::shared_ptr<ArrayData> values = in_array.child_data[0].ToArrayData();

// Shift bitmap in case the source offset is non-zero
if (in_array.offset != 0 && in_array.buffers[0].data != nullptr) {
ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
CopyBitmap(ctx->memory_pool(), in_array.buffers[0].data,
in_array.offset, in_array.length));
}

RETURN_NOT_OK(HandleOffsets(ctx, in_array, out_array, &values));

// Handle values
ARROW_ASSIGN_OR_RAISE(Datum cast_values,
Cast(values, child_type, options, ctx->exec_context()));
Expand Down Expand Up @@ -237,6 +242,74 @@ void AddTypeToTypeCast(CastFunction* func) {
DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel)));
}

// Map -> Map / (Large)List<Struct> cast kernel.
//
// DestType selects the destination container type. The destination's value
// type must be a struct with exactly two fields; the map's key and item
// children are cast to the types of those two fields respectively.
template <typename DestType>
struct CastMap {
  // Offsets are handled by the same helper the list-to-list cast uses.
  using CastListImpl = CastList<MapType, DestType>;

  // Kernel entry point. Reads the input map array from batch[0] and fills
  // the array data held by `out`. Returns TypeError when the destination
  // value type is not a two-field struct, and propagates any error raised
  // while copying the bitmap, handling offsets, or casting the children.
  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
    const CastOptions& cast_options = CastState::Get(ctx);

    std::shared_ptr<DataType> entry_type =
        checked_cast<const DestType&>(*out->type()).value_type();

    // The entries must be representable as struct<key, item>.
    const bool is_two_field_struct =
        entry_type->id() == Type::STRUCT && entry_type->num_fields() == 2;
    if (!is_two_field_struct) {
      return Status::TypeError(
          "Map type must be cast to a list<struct> with exactly two fields.");
    }
    std::shared_ptr<DataType> key_type = entry_type->field(0)->type();
    std::shared_ptr<DataType> item_type = entry_type->field(1)->type();

    const ArraySpan& input = batch[0].array;
    ArrayData* output = out->array_data().get();

    // Start by sharing the input's validity and offsets buffers; either may
    // be replaced below.
    output->buffers[0] = input.GetBuffer(0);
    output->buffers[1] = input.GetBuffer(1);

    std::shared_ptr<ArrayData> entries = input.child_data[0].ToArrayData();

    // A non-zero source offset requires a shifted copy of the validity bitmap
    // (only when a bitmap is actually present).
    const bool needs_bitmap_shift =
        input.offset != 0 && input.buffers[0].data != nullptr;
    if (needs_bitmap_shift) {
      ARROW_ASSIGN_OR_RAISE(output->buffers[0],
                            CopyBitmap(ctx->memory_pool(), input.buffers[0].data,
                                       input.offset, input.length));
    }

    // May rewrite the offsets buffer and replace `entries` with a slice, so it
    // has to run before the children are extracted.
    RETURN_NOT_OK(CastListImpl::HandleOffsets(ctx, input, output, &entries));

    // Cast the key child, restricted to the window described by `entries`.
    const std::shared_ptr<ArrayData> key_data =
        entries->child_data[0]->Slice(entries->offset, entries->length);
    ARROW_ASSIGN_OR_RAISE(
        Datum cast_keys, Cast(key_data, key_type, cast_options, ctx->exec_context()));
    DCHECK(cast_keys.is_array());

    // Cast the item child over the same window.
    const std::shared_ptr<ArrayData> item_data =
        entries->child_data[1]->Slice(entries->offset, entries->length);
    ARROW_ASSIGN_OR_RAISE(
        Datum cast_items,
        Cast(item_data, item_type, cast_options, ctx->exec_context()));
    DCHECK(cast_items.is_array());

    // Reassemble the entries as a struct array with no validity bitmap and a
    // null count of zero, then attach it as the output's only child.
    std::shared_ptr<ArrayData> cast_entries =
        ArrayData::Make(entry_type, /*length=*/entries->length, {nullptr},
                        {cast_keys.array(), cast_items.array()}, /*null_count=*/0);
    output->child_data.push_back(cast_entries);

    return Status::OK();
  }
};

// Registers the Map -> DestType cast kernel (CastMap<DestType>::Exec) on
// `func`, keyed on the map type id.
template <typename DestType>
void AddMapCast(CastFunction* func) {
  ScalarKernel map_kernel;
  map_kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
  map_kernel.signature =
      KernelSignature::Make({InputType(MapType::type_id)}, kOutputTargetType);
  map_kernel.exec = CastMap<DestType>::Exec;
  DCHECK_OK(func->AddKernel(MapType::type_id, std::move(map_kernel)));
}

} // namespace

std::vector<std::shared_ptr<CastFunction>> GetNestedCasts() {
Expand All @@ -253,6 +326,12 @@ std::vector<std::shared_ptr<CastFunction>> GetNestedCasts() {
AddListCast<ListType, LargeListType>(cast_large_list.get());
AddListCast<LargeListType, LargeListType>(cast_large_list.get());

auto cast_map = std::make_shared<CastFunction>("cast_map", Type::MAP);
AddCommonCasts(Type::MAP, kOutputTargetType, cast_map.get());
AddMapCast<MapType>(cast_map.get());
AddMapCast<ListType>(cast_list.get());
AddMapCast<LargeListType>(cast_large_list.get());

// FSL is a bit incomplete at the moment
auto cast_fsl =
std::make_shared<CastFunction>("cast_fixed_size_list", Type::FIXED_SIZE_LIST);
Expand All @@ -269,7 +348,7 @@ std::vector<std::shared_ptr<CastFunction>> GetNestedCasts() {
std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get());

return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary};
return {cast_list, cast_large_list, cast_map, cast_fsl, cast_struct, cast_dictionary};
}

} // namespace internal
Expand Down
48 changes: 48 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2276,6 +2276,54 @@ TEST(Cast, FSLToFSLOptionsPassThru) {
CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), "[[32689]]"), options);
}

// Exercises casts from map arrays: to maps with renamed fields or different
// key/item types, to (large) lists of two-field structs, and the error raised
// when the target struct does not have exactly two fields.
TEST(Cast, CastMap) {
  const std::string map_json =
      "[[[\"x\", 1], [\"y\", 8], [\"z\", 9]], [[\"x\", 6]], [[\"y\", 36]]]";
  const std::string map_json_nullable =
      "[[[\"x\", 1], [\"y\", null], [\"z\", 9]], null, [[\"y\", 36]]]";

  // Casts a map<utf8 (non-nullable), int64> built from each JSON fixture to
  // `target_type` and compares with the same JSON parsed as `target_type`.
  auto check_cast_to = [map_json,
                        map_json_nullable](const std::shared_ptr<DataType>& target_type) {
    std::shared_ptr<DataType> source_type =
        std::make_shared<MapType>(field("x", utf8(), false), field("y", int64()));
    for (const std::string& json : {map_json, map_json_nullable}) {
      std::shared_ptr<Array> input = ArrayFromJSON(source_type, json);
      std::shared_ptr<Array> expected = ArrayFromJSON(target_type, json);
      CheckCast(input, expected);
    }
  };

  // Can rename fields
  check_cast_to(
      std::make_shared<MapType>(field("a", utf8(), false), field("b", int64())));
  // Can cast keys and values to other types
  check_cast_to(map(large_utf8(), field("y", int32())));
  // Can cast a map to a list<struct<keys=.., values=..>>
  check_cast_to(list(struct_({field("a", utf8()), field("b", int64())})));
  // Can cast a map to a large_list<struct<keys=.., values=..>>
  check_cast_to(large_list(struct_({field("a", utf8()), field("b", int64())})));

  // Can rename nested field names
  const std::string nested_json = "[[[\"1\", [1,2,3]]], [[\"2\", [4,5,6]]]]";
  std::shared_ptr<DataType> nested_src_type =
      map(utf8(), field("x", list(field("a", int64()))));
  std::shared_ptr<DataType> nested_dst_type =
      map(utf8(), field("y", list(field("b", int64()))));

  std::shared_ptr<Array> nested_src = ArrayFromJSON(nested_src_type, nested_json);
  std::shared_ptr<Array> nested_dst = ArrayFromJSON(nested_dst_type, nested_json);

  CheckCast(nested_src, nested_dst);

  // Cannot cast to a list<struct<[fields]>> if there are not exactly 2 fields
  std::shared_ptr<DataType> three_field_type = list(
      struct_({field("key", int32()), field("value", int64()), field("extra", int64())}));
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      TypeError,
      ::testing::HasSubstr("must be cast to a list<struct> with exactly two fields"),
      Cast(nested_src, three_field_type));
}

static void CheckStructToStruct(
const std::vector<std::shared_ptr<DataType>>& value_types) {
for (const auto& src_value_type : value_types) {
Expand Down
Loading

0 comments on commit 225975e

Please sign in to comment.