Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-36420: [C++] Add An Enum Option For SetLookup Options #36739

Merged
merged 46 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
692bf1b
In Function Init
R-JunmingChen Jul 18, 2023
7098b4f
Update api_scalar.cc
R-JunmingChen Jul 18, 2023
19798a1
Update api_scalar.h
R-JunmingChen Jul 18, 2023
30ac5fe
Update scalar_set_lookup.cc
R-JunmingChen Jul 18, 2023
3ef86f5
fix bug
R-JunmingChen Jul 19, 2023
657db1f
Merge branch 'ARROW-36420' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Jul 19, 2023
60bbb14
fix bug2
R-JunmingChen Jul 19, 2023
b79a4dc
isin init
R-JunmingChen Aug 18, 2023
5be997c
ARROW-36420
R-JunmingChen Aug 18, 2023
52e1dfb
fix bug
R-JunmingChen Aug 20, 2023
f62c50f
roll back
R-JunmingChen Aug 20, 2023
712454f
lint
R-JunmingChen Aug 20, 2023
0ea3bdf
doc test
R-JunmingChen Aug 20, 2023
5d7602b
set g_lib
R-JunmingChen Aug 20, 2023
f6c4159
glib2
R-JunmingChen Aug 20, 2023
89d7759
Merge branch 'main' of https://github.com/R-JunmingChen/arrow into AR…
R-JunmingChen Aug 21, 2023
b61ba64
add a test
R-JunmingChen Aug 21, 2023
d409b7e
add a test2
R-JunmingChen Aug 21, 2023
a0a511d
add a test 3
R-JunmingChen Aug 21, 2023
aa9eb01
add a test4
R-JunmingChen Aug 21, 2023
f129754
lint
R-JunmingChen Aug 21, 2023
bf7c145
Update c_glib/arrow-glib/compute.cpp
R-JunmingChen Aug 21, 2023
bb55eda
Update c_glib/arrow-glib/compute.cpp
R-JunmingChen Aug 21, 2023
039eeed
Update c_glib/arrow-glib/compute.cpp
R-JunmingChen Aug 21, 2023
bba645b
doc
R-JunmingChen Aug 23, 2023
e2aa28a
fix comments
R-JunmingChen Aug 29, 2023
cfbb4ce
fix comments 2
R-JunmingChen Aug 30, 2023
021223b
DEPRECATED
R-JunmingChen Aug 30, 2023
76b353a
Deprecated update
R-JunmingChen Aug 30, 2023
4338eac
roll back
R-JunmingChen Aug 30, 2023
e61278b
roll back
R-JunmingChen Aug 30, 2023
14fcc44
Update _compute.pyx
R-JunmingChen Aug 30, 2023
f26892a
Update scalar_set_lookup.cc
R-JunmingChen Sep 4, 2023
ef4d4b9
Update cpp/src/arrow/compute/api_scalar.h
R-JunmingChen Sep 20, 2023
9628bf2
Update cpp/src/arrow/compute/api_scalar.h
R-JunmingChen Sep 20, 2023
f057ca6
move comment
R-JunmingChen Sep 20, 2023
982c17f
Merge branch 'ARROW-36420' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Sep 20, 2023
abfe890
NotImplemented
R-JunmingChen Sep 20, 2023
2115e22
return
R-JunmingChen Sep 20, 2023
2352e42
add default test
R-JunmingChen Sep 20, 2023
e71fc27
add test for isin
R-JunmingChen Sep 20, 2023
d161639
add test for indexin
R-JunmingChen Sep 20, 2023
8cfe97c
CoercedDataMember
R-JunmingChen Sep 20, 2023
2c9fa9c
lint
R-JunmingChen Sep 20, 2023
1b09661
expression test
R-JunmingChen Sep 20, 2023
38c1a04
python expression
R-JunmingChen Sep 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,14 @@ Result<Datum> MinElementWise(const std::vector<Datum>& args,
// ----------------------------------------------------------------------
// Set-related operations

// Convenience wrapper: invokes the registered "in" compute function on
// `values`, forwarding `options` (which carries the value set) and `ctx`.
Result<Datum> In(const Datum& values, const SetLookupOptions& options, ExecContext* ctx) {
  const std::vector<Datum> call_args{values};
  return CallFunction("in", call_args, &options, ctx);
}

// Convenience overload: wraps `value_set` in a SetLookupOptions and delegates
// to the options-taking overload of In.
Result<Datum> In(const Datum& values, const Datum& value_set, ExecContext* ctx) {
  const SetLookupOptions lookup_options{value_set};
  return In(values, lookup_options, ctx);
}

Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
ExecContext* ctx) {
return CallFunction("is_in", {values}, &options, ctx);
Expand Down
22 changes: 22 additions & 0 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,28 @@ ARROW_EXPORT
Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
ExecContext* ctx = NULLPTR);

/// \brief In returns true for each element of `values` that is contained in
/// `value_set`.
///
/// In is SQL-compatible: a null in `values` directly outputs null, and
/// each element of `values` that isn't contained in `value_set`
/// outputs null if `value_set` contains a null, or false if
/// `value_set` doesn't contain a null.
///
/// In ignores the parameter skip_nulls in SetLookupOptions.
///
/// \param[in] values array-like input to look up in value_set
/// \param[in] options SetLookupOptions
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 12.0.1
/// \note API not yet finalized
ARROW_EXPORT
Result<Datum> In(const Datum& values, const SetLookupOptions& options,
ExecContext* ctx = NULLPTR);
/// \brief Overload of In taking the value set directly; `value_set` is
/// wrapped in a SetLookupOptions before the lookup is performed.
///
/// \param[in] values array-like input to look up in value_set
/// \param[in] value_set the set of values to look for
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
ARROW_EXPORT
Result<Datum> In(const Datum& values, const Datum& value_set, ExecContext* ctx = NULLPTR);

/// \brief IsIn returns true for each element of `values` that is contained in
/// `value_set`
///
Expand Down
167 changes: 167 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,120 @@ Status ExecIndexIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
}

// ----------------------------------------------------------------------
// In writes its results into preallocated boolean data and validity bitmaps.
//
// SQL-compatible semantics implemented here:
//   - a null input slot                               -> null
//   - a value found in the value set                  -> true
//   - a value not found, value set contains a null    -> null
//   - a value not found, value set contains no null   -> false
//
// The skip_nulls member of SetLookupOptions is not consulted.
struct InVisitor {
  KernelContext* ctx;
  const ArraySpan& data;
  ArraySpan* out;
  // Boolean data bitmap of the output (buffers[1]).
  uint8_t* out_boolean_bitmap;
  // Validity bitmap of the output (buffers[0]). Per the Arrow columnar
  // format a *set* bit marks a valid slot and a *cleared* bit marks a null.
  uint8_t* out_null_bitmap;

  InVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out)
      : ctx(ctx),
        data(data),
        out(out),
        out_boolean_bitmap(out->buffers[1].data),
        out_null_bitmap(out->buffers[0].data) {}

  // Null-typed value set: every output slot is null.
  Status Visit(const DataType& type) {
    DCHECK_EQ(type.id(), Type::NA);
    // skip_nulls is ignored in sql-compatible In
    bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false);
    // Clear the validity bits so that every slot reads as null.
    bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, false);

    return Status::OK();
  }

  // Looks up every slot of `input` in the hash table held by `state` and
  // writes the boolean result and its validity bit for each slot.
  template <typename Type>
  Status ProcessIsIn(const SetLookupState<Type>& state, const ArraySpan& input) {
    using T = typename GetViewType<Type>::T;
    FirstTimeBitmapWriter writer_boolean(out_boolean_bitmap, out->offset, out->length);
    FirstTimeBitmapWriter writer_null(out_null_bitmap, out->offset, out->length);
    const bool value_set_has_null = state.null_index != -1;
    VisitArraySpanInline<Type>(
        input,
        [&](T v) {
          if (state.lookup_table->Get(v) != -1) {
            // Found: valid `true`.
            writer_boolean.Set();
            writer_null.Set();
          } else if (value_set_has_null) {
            // Not found, but the set holds a null: the answer is unknown.
            writer_boolean.Clear();
            writer_null.Clear();
          } else {
            // Not found and the set holds no null: valid `false`.
            writer_boolean.Clear();
            writer_null.Set();
          }
          writer_boolean.Next();
          writer_null.Next();
        },
        [&]() {
          // Null input slot: null output.
          writer_boolean.Clear();
          writer_null.Clear();
          writer_boolean.Next();
          writer_null.Next();
        });
    writer_boolean.Finish();
    writer_null.Finish();
    return Status::OK();
  }

  // Fetches the typed lookup state from the kernel context and, when the
  // input type differs from the value set type, casts the input to the value
  // set type (safe cast) before performing the lookup.
  template <typename Type>
  Status ProcessIsIn() {
    const auto& state = checked_cast<const SetLookupState<Type>&>(*ctx->state());

    if (!data.type->Equals(state.value_set_type)) {
      auto materialized_input = data.ToArrayData();
      auto cast_result = Cast(*materialized_input, state.value_set_type,
                              CastOptions::Safe(), ctx->exec_context());
      if (ARROW_PREDICT_FALSE(!cast_result.ok())) {
        // An unsupported cast is reported as a type mismatch; any other cast
        // failure is propagated unchanged.
        if (cast_result.status().IsNotImplemented()) {
          return Status::TypeError("Array type doesn't match type of values set: ",
                                   *data.type, " vs ", *state.value_set_type);
        }
        return cast_result.status();
      }
      auto casted_input = *cast_result;
      return ProcessIsIn(state, *casted_input.array());
    }
    return ProcessIsIn(state, data);
  }

  template <typename Type>
  enable_if_boolean<Type, Status> Visit(const Type&) {
    return ProcessIsIn<BooleanType>();
  }

  // Types with a C representation (other than boolean and month-day-nano
  // intervals) are looked up through the unsigned integer type of equal width.
  template <typename Type>
  enable_if_t<has_c_type<Type>::value && !is_boolean_type<Type>::value &&
                  !std::is_same<Type, MonthDayNanoIntervalType>::value,
              Status>
  Visit(const Type&) {
    return ProcessIsIn<typename UnsignedIntType<sizeof(typename Type::c_type)>::Type>();
  }

  template <typename Type>
  enable_if_base_binary<Type, Status> Visit(const Type&) {
    return ProcessIsIn<typename Type::PhysicalType>();
  }

  // Handle Decimal128Type, FixedSizeBinaryType
  Status Visit(const FixedSizeBinaryType&) {
    return ProcessIsIn<FixedSizeBinaryType>();
  }

  Status Visit(const MonthDayNanoIntervalType&) {
    return ProcessIsIn<MonthDayNanoIntervalType>();
  }

  // Dispatches on the type of the value set stored in the kernel state.
  Status Execute() {
    const auto& state = checked_cast<const SetLookupStateBase&>(*ctx->state());
    return VisitTypeInline(*state.value_set_type, this);
  }
};

// Kernel exec function for "in": runs an InVisitor over the first argument of
// the batch, writing into the mutable output array span.
Status ExecIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
  InVisitor visitor(ctx, batch[0].array, out->array_span_mutable());
  return visitor.Execute();
}

// IsIn writes the results into a preallocated boolean data bitmap
struct IsInVisitor {
Expand Down Expand Up @@ -516,6 +630,28 @@ void AddBasicSetLookupKernels(ScalarKernel kernel,
}
}

// Documentation for the unary "in" function, whose value set is supplied
// through SetLookupOptions.
const FunctionDoc in_doc{
    "Find each element in a set of values in a sql-compatible way",
    ("For each element in `values`, return true if it is found in a given\n"
     "set of values. A null in `values` will directly return null.\n"
     "Each element of `values` that isn't contained in the set of values\n"
     "will return null if the set of values contains null and return\n"
     "false if the set of values doesn't contain null.\n"
     "The set of values to look for must be given in SetLookupOptions.\n"
     "The parameter skip_nulls in SetLookupOptions is ignored in this function."),
    {"values"},
    "SetLookupOptions",
    /*options_required=*/true};

// Documentation for the binary meta-function form of "in", which takes the
// value set as a second argument.
const FunctionDoc in_meta_doc{
    "Find each element in a set of values in a sql-compatible way",
    ("For each element in `values`, return true if it is found in `value_set`.\n"
     "A null in `values` will directly return null.\n"
     "Each element of `values` that isn't contained in `value_set`\n"
     "will return null if `value_set` contains null and return\n"
     "false if `value_set` doesn't contain null."),
    {"values", "value_set"}};

const FunctionDoc is_in_doc{
"Find each element in a set of values",
("For each element in `values`, return true if it is found in a given\n"
Expand Down Expand Up @@ -550,6 +686,20 @@ const FunctionDoc index_in_meta_doc{
"or null if it is not found there."),
{"values", "value_set"}};

// Enables calling "in" with CallFunction as though it were binary: the second
// argument is used as the value set. No FunctionOptions are accepted.
class InMetaBinary : public MetaFunction {
 public:
  InMetaBinary() : MetaFunction("in_meta_binary", Arity::Binary(), in_meta_doc) {}

  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
                            const FunctionOptions* options,
                            ExecContext* ctx) const override {
    // Happy path first: with no options supplied, forward to In.
    if (options == nullptr) {
      const Datum& values = args[0];
      const Datum& value_set = args[1];
      return In(values, value_set, ctx);
    }
    return Status::Invalid("Unexpected options for 'in_meta_binary' function");
  }
};

// Enables calling is_in with CallFunction as though it were binary.
class IsInMetaBinary : public MetaFunction {
public:
Expand Down Expand Up @@ -593,6 +743,23 @@ struct SetLookupFunction : ScalarFunction {
} // namespace

void RegisterScalarSetLookup(FunctionRegistry* registry) {
// In writes its boolean output into preallocated memory
{
ScalarKernel in_base;
in_base.init = InitSetLookup;
in_base.exec = ExecIn;
in_base.null_handling = NullHandling::COMPUTED_PREALLOCATE;
auto in = std::make_shared<SetLookupFunction>("in", Arity::Unary(), in_doc);

AddBasicSetLookupKernels(in_base, /*output_type=*/boolean(), in.get());

in_base.signature = KernelSignature::Make({null()}, boolean());
DCHECK_OK(in->AddKernel(in_base));
DCHECK_OK(registry->AddFunction(in));

DCHECK_OK(registry->AddFunction(std::make_shared<InMetaBinary>()));
}

// IsIn writes its boolean output into preallocated memory
{
ScalarKernel isin_base;
Expand Down
Loading