Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-36420: [C++] Add An Enum Option For SetLookup Options #36739

Merged
merged 46 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
692bf1b
In Function Init
R-JunmingChen Jul 18, 2023
7098b4f
Update api_scalar.cc
R-JunmingChen Jul 18, 2023
19798a1
Update api_scalar.h
R-JunmingChen Jul 18, 2023
30ac5fe
Update scalar_set_lookup.cc
R-JunmingChen Jul 18, 2023
3ef86f5
fix bug
R-JunmingChen Jul 19, 2023
657db1f
Merge branch 'ARROW-36420' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Jul 19, 2023
60bbb14
fix bug2
R-JunmingChen Jul 19, 2023
b79a4dc
isin init
R-JunmingChen Aug 18, 2023
5be997c
ARROW-36420
R-JunmingChen Aug 18, 2023
52e1dfb
fix bug
R-JunmingChen Aug 20, 2023
f62c50f
roll back
R-JunmingChen Aug 20, 2023
712454f
lint
R-JunmingChen Aug 20, 2023
0ea3bdf
doc test
R-JunmingChen Aug 20, 2023
5d7602b
set g_lib
R-JunmingChen Aug 20, 2023
f6c4159
glib2
R-JunmingChen Aug 20, 2023
89d7759
Merge branch 'main' of https://github.com/R-JunmingChen/arrow into AR…
R-JunmingChen Aug 21, 2023
b61ba64
add a test
R-JunmingChen Aug 21, 2023
d409b7e
add a test2
R-JunmingChen Aug 21, 2023
a0a511d
add a test 3
R-JunmingChen Aug 21, 2023
aa9eb01
add a test4
R-JunmingChen Aug 21, 2023
f129754
lint
R-JunmingChen Aug 21, 2023
bf7c145
Update c_glib/arrow-glib/compute.cpp
R-JunmingChen Aug 21, 2023
bb55eda
Update c_glib/arrow-glib/compute.cpp
R-JunmingChen Aug 21, 2023
039eeed
Update c_glib/arrow-glib/compute.cpp
R-JunmingChen Aug 21, 2023
bba645b
doc
R-JunmingChen Aug 23, 2023
e2aa28a
fix comments
R-JunmingChen Aug 29, 2023
cfbb4ce
fix comments 2
R-JunmingChen Aug 30, 2023
021223b
DEPRECATED
R-JunmingChen Aug 30, 2023
76b353a
Deprecated update
R-JunmingChen Aug 30, 2023
4338eac
roll back
R-JunmingChen Aug 30, 2023
e61278b
roll back
R-JunmingChen Aug 30, 2023
14fcc44
Update _compute.pyx
R-JunmingChen Aug 30, 2023
f26892a
Update scalar_set_lookup.cc
R-JunmingChen Sep 4, 2023
ef4d4b9
Update cpp/src/arrow/compute/api_scalar.h
R-JunmingChen Sep 20, 2023
9628bf2
Update cpp/src/arrow/compute/api_scalar.h
R-JunmingChen Sep 20, 2023
f057ca6
move comment
R-JunmingChen Sep 20, 2023
982c17f
Merge branch 'ARROW-36420' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Sep 20, 2023
abfe890
NotImplemented
R-JunmingChen Sep 20, 2023
2115e22
return
R-JunmingChen Sep 20, 2023
2352e42
add default test
R-JunmingChen Sep 20, 2023
e71fc27
add test for isin
R-JunmingChen Sep 20, 2023
d161639
add test for indexin
R-JunmingChen Sep 20, 2023
8cfe97c
CoercedDataMember
R-JunmingChen Sep 20, 2023
2c9fa9c
lint
R-JunmingChen Sep 20, 2023
1b09661
expression test
R-JunmingChen Sep 20, 2023
38c1a04
python expression
R-JunmingChen Sep 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
27 changes: 18 additions & 9 deletions c_glib/arrow-glib/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3346,7 +3346,11 @@ garrow_set_lookup_options_get_property(GObject *object,
g_value_set_object(value, priv->value_set);
break;
case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS:
g_value_set_boolean(value, options->skip_nulls);
if (!options->skip_nulls.has_value() || !options->skip_nulls.value()) {
g_value_set_boolean(value, false);
} else {
g_value_set_boolean(value, true);
}
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
Expand Down Expand Up @@ -3398,13 +3402,14 @@ garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass)
*
* Since: 6.0.0
*/
spec = g_param_spec_boolean("skip-nulls",
"Skip NULLs",
"Whether NULLs are skipped or not",
options.skip_nulls,
static_cast<GParamFlags>(G_PARAM_READWRITE));
g_object_class_install_property(gobject_class,
PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS,
bool skip_nulls = false;
if (options.skip_nulls.has_value() && options.skip_nulls.value()) {
skip_nulls = true;
}
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
spec =
g_param_spec_boolean("skip-nulls", "Skip NULLs", "Whether NULLs are skipped or not",
skip_nulls, static_cast<GParamFlags>(G_PARAM_READWRITE));
g_object_class_install_property(gobject_class, PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS,
spec);
}

Expand Down Expand Up @@ -6458,9 +6463,13 @@ garrow_set_lookup_options_new_raw(
arrow_copied_options.get());
auto value_set =
garrow_datum_new_raw(&(arrow_copied_set_lookup_options->value_set));
bool skip_nulls = false;
if (arrow_options->skip_nulls.has_value() && arrow_options->skip_nulls.value()) {
skip_nulls = true;
}
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
auto options = g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS,
"value-set", value_set,
"skip-nulls", arrow_options->skip_nulls,
"skip-nulls", skip_nulls,
NULL);
return GARROW_SET_LOOKUP_OPTIONS(options);
}
Expand Down
51 changes: 48 additions & 3 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,29 @@ struct EnumTraits<compute::MapLookupOptions::Occurrence>
}
};

template <>
struct EnumTraits<compute::SetLookupOptions::NullMatchingBehavior>
: BasicEnumTraits<compute::SetLookupOptions::NullMatchingBehavior,
compute::SetLookupOptions::NullMatchingBehavior::MATCH,
compute::SetLookupOptions::NullMatchingBehavior::SKIP,
compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL,
compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE> {
static std::string name() { return "SetLookupOptions::NullMatchingBehavior"; }
static std::string value_name(compute::SetLookupOptions::NullMatchingBehavior value) {
switch (value) {
case compute::SetLookupOptions::NullMatchingBehavior::MATCH:
return "MATCH";
case compute::SetLookupOptions::NullMatchingBehavior::SKIP:
return "SKIP";
case compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL:
return "EMIT_NULL";
case compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE:
return "INCONCLUSIVE";
}
return "<INVALID>";
}
};

} // namespace internal

namespace compute {
Expand Down Expand Up @@ -344,7 +367,8 @@ static auto kRoundToMultipleOptionsType = GetFunctionOptionsType<RoundToMultiple
DataMember("round_mode", &RoundToMultipleOptions::round_mode));
static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
DataMember("value_set", &SetLookupOptions::value_set),
DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
DataMember("skip_nulls", &SetLookupOptions::skip_nulls),
DataMember("null_matching_behavior", &SetLookupOptions::null_matching_behavior));
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
static auto kSliceOptionsType = GetFunctionOptionsType<SliceOptions>(
DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
DataMember("step", &SliceOptions::step));
Expand Down Expand Up @@ -540,8 +564,29 @@ constexpr char RoundToMultipleOptions::kTypeName[];
SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
: FunctionOptions(internal::kSetLookupOptionsType),
value_set(std::move(value_set)),
skip_nulls(skip_nulls) {}
SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
skip_nulls(skip_nulls) {
if (skip_nulls) {
this->null_matching_behavior = SetLookupOptions::SKIP;
} else {
this->null_matching_behavior = SetLookupOptions::MATCH;
}
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
}
SetLookupOptions::SetLookupOptions(
Datum value_set, SetLookupOptions::NullMatchingBehavior null_matching_behavior)
: FunctionOptions(internal::kSetLookupOptionsType),
value_set(std::move(value_set)),
null_matching_behavior(std::move(null_matching_behavior)) {}
SetLookupOptions::SetLookupOptions()
: SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {}
SetLookupOptions::NullMatchingBehavior SetLookupOptions::getNullMatchingBehavior() const {
if (!this->skip_nulls.has_value()) {
return this->null_matching_behavior;
} else if (this->skip_nulls.value()) {
return SetLookupOptions::SKIP;
} else {
return SetLookupOptions::MATCH;
}
}
constexpr char SetLookupOptions::kTypeName[];

SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
Expand Down
14 changes: 12 additions & 2 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,19 +268,29 @@ class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
/// Options for IsIn and IndexIn functions
class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
public:
explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE };
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved

explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH);
explicit SetLookupOptions(Datum value_set, bool skip_nulls);
SetLookupOptions();
static constexpr char const kTypeName[] = "SetLookupOptions";

/// The set of values to look up input values into.
Datum value_set;

NullMatchingBehavior null_matching_behavior;

// DEPRECATED(will be removed after removing of skip_nulls)
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
NullMatchingBehavior getNullMatchingBehavior() const;
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved

// DEPRECATED(use null_matching_behavior instead)
/// Whether nulls in `value_set` count for lookup.
///
/// If true, any null in `value_set` is ignored and nulls in the input
/// produce null (IndexIn) or false (IsIn) values in the output.
/// If false, any null in `value_set` is successfully matched in
/// the input.
bool skip_nulls;
std::optional<bool> skip_nulls;
};

/// Options for struct_field function
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/expression_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ TEST(Expression, ToString) {
compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")});

EXPECT_EQ(in_12.ToString(),
"index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=false})");
"index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=nullopt, "
"null_matching_behavior=MATCH})");

EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)");
EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)");
Expand Down
92 changes: 67 additions & 25 deletions cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ struct SetLookupState : public SetLookupStateBase {
explicit SetLookupState(MemoryPool* pool) : memory_pool(pool) {}

Status Init(const SetLookupOptions& options) {
this->null_matching_behavior = options.getNullMatchingBehavior();
if (options.value_set.is_array()) {
const ArrayData& value_set = *options.value_set.array();
memo_index_to_value_index.reserve(value_set.length);
Expand All @@ -66,7 +67,8 @@ struct SetLookupState : public SetLookupStateBase {
} else {
return Status::Invalid("value_set should be an array or chunked array");
}
if (!options.skip_nulls && lookup_table->GetNull() >= 0) {
if (this->null_matching_behavior != SetLookupOptions::SKIP &&
lookup_table->GetNull() >= 0) {
null_index = memo_index_to_value_index[lookup_table->GetNull()];
}
value_set_type = options.value_set.type();
Expand Down Expand Up @@ -117,19 +119,23 @@ struct SetLookupState : public SetLookupStateBase {
// be mapped back to indices in the value_set.
std::vector<int32_t> memo_index_to_value_index;
int32_t null_index = -1;
SetLookupOptions::NullMatchingBehavior null_matching_behavior;
};

template <>
struct SetLookupState<NullType> : public SetLookupStateBase {
explicit SetLookupState(MemoryPool*) {}

Status Init(const SetLookupOptions& options) {
value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls;
Status Init(SetLookupOptions& options) {
null_matching_behavior = options.getNullMatchingBehavior();
value_set_has_null = (options.value_set.length() > 0) &&
this->null_matching_behavior != SetLookupOptions::SKIP;
value_set_type = null();
return Status::OK();
}

bool value_set_has_null;
SetLookupOptions::NullMatchingBehavior null_matching_behavior;
};

// TODO: Put this concept somewhere reusable
Expand Down Expand Up @@ -274,10 +280,12 @@ struct IndexInVisitor {
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved

if (data.length != 0) {
// skip_nulls is honored for consistency with other types
bit_util::SetBitsTo(out_bitmap, out->offset, out->length, state.value_set_has_null);
bit_util::SetBitsTo(out_bitmap, out->offset, out->length,
state.null_matching_behavior == SetLookupOptions::MATCH &&
state.value_set_has_null);

// Set all values to 0, which will be unmasked only if null is in the value_set
// and null_matching_behavior is equal to MATCH
std::memset(out->GetValues<int32_t>(1), 0x00, out->length * sizeof(int32_t));
}
return Status::OK();
Expand Down Expand Up @@ -305,7 +313,8 @@ struct IndexInVisitor {
bitmap_writer.Next();
},
[&]() {
if (state.null_index != -1) {
if (state.null_index != -1 &&
state.null_matching_behavior == SetLookupOptions::MATCH) {
bitmap_writer.Set();

// value_set included null
Expand Down Expand Up @@ -379,49 +388,82 @@ Status ExecIndexIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return IndexInVisitor(ctx, batch[0].array, out->array_span_mutable()).Execute();
}

// ----------------------------------------------------------------------

// IsIn writes the results into a preallocated boolean data bitmap
struct IsInVisitor {
KernelContext* ctx;
const ArraySpan& data;
ArraySpan* out;
uint8_t* out_boolean_bitmap;
uint8_t* out_null_bitmap;

IsInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out)
: ctx(ctx), data(data), out(out) {}
: ctx(ctx),
data(data),
out(out),
out_boolean_bitmap(out->buffers[1].data),
out_null_bitmap(out->buffers[0].data) {}

Status Visit(const DataType& type) {
R-JunmingChen marked this conversation as resolved.
Show resolved Hide resolved
DCHECK_EQ(type.id(), Type::NA);
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
// skip_nulls is honored for consistency with other types
bit_util::SetBitsTo(out->buffers[1].data, out->offset, out->length,
state.value_set_has_null);

if (state.null_matching_behavior == SetLookupOptions::MATCH &&
state.value_set_has_null) {
bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, true);
bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true);
} else if (state.null_matching_behavior == SetLookupOptions::SKIP ||
(!state.value_set_has_null &&
state.null_matching_behavior == SetLookupOptions::MATCH)) {
bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false);
bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true);
} else {
bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, false);
}
return Status::OK();
}

template <typename Type>
Status ProcessIsIn(const SetLookupState<Type>& state, const ArraySpan& input) {
using T = typename GetViewType<Type>::T;
FirstTimeBitmapWriter writer(out->buffers[1].data, out->offset, out->length);
FirstTimeBitmapWriter writer_boolean(out_boolean_bitmap, out->offset, out->length);
FirstTimeBitmapWriter writer_null(out_null_bitmap, out->offset, out->length);
bool value_set_has_null = state.null_index != -1;
VisitArraySpanInline<Type>(
input,
[&](T v) {
if (state.lookup_table->Get(v) != -1) {
writer.Set();
} else {
writer.Clear();
if (state.lookup_table->Get(v) != -1) { // true
writer_boolean.Set();
writer_null.Set();
} else if (state.null_matching_behavior == SetLookupOptions::INCONCLUSIVE &&
value_set_has_null) { // null
writer_boolean.Clear();
writer_null.Clear();
} else { // false
writer_boolean.Clear();
writer_null.Set();
}
writer.Next();
writer_boolean.Next();
writer_null.Next();
},
[&]() {
if (state.null_index != -1) {
writer.Set();
} else {
writer.Clear();
if (state.null_matching_behavior == SetLookupOptions::MATCH &&
value_set_has_null) { // true
writer_boolean.Set();
writer_null.Set();
} else if (state.null_matching_behavior == SetLookupOptions::SKIP ||
(!value_set_has_null && state.null_matching_behavior ==
SetLookupOptions::MATCH)) { // false
writer_boolean.Clear();
writer_null.Set();
} else { // null
writer_boolean.Clear();
writer_null.Clear();
}
writer.Next();
writer_boolean.Next();
writer_null.Next();
});
writer.Finish();
writer_boolean.Finish();
writer_null.Finish();
return Status::OK();
}

Expand Down Expand Up @@ -598,7 +640,7 @@ void RegisterScalarSetLookup(FunctionRegistry* registry) {
ScalarKernel isin_base;
isin_base.init = InitSetLookup;
isin_base.exec = ExecIsIn;
isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
isin_base.null_handling = NullHandling::COMPUTED_PREALLOCATE;
auto is_in = std::make_shared<SetLookupFunction>("is_in", Arity::Unary(), is_in_doc);

AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get());
Expand Down