From 692bf1b3efcc895ddcdd2e0d7b60301d4dd61a23 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 18 Jul 2023 09:11:04 +0800 Subject: [PATCH 01/43] In Function Init --- cpp/src/arrow/compute/api_scalar.cc | 9 + cpp/src/arrow/compute/api_scalar.h | 23 +++ .../compute/kernels/scalar_set_lookup.cc | 167 ++++++++++++++++++ 3 files changed, 199 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index ae7e82fb2f9e4..9551d052e4832 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -745,6 +745,15 @@ Result MinElementWise(const std::vector& args, // ---------------------------------------------------------------------- // Set-related operations +Result In(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx) { + return CallFunction("in", {values}, &options, ctx); +} + +Result In(const Datum& values, const Datum& value_set, ExecContext* ctx) { + return In(values, SetLookupOptions{value_set}, ctx); +} + Result IsIn(const Datum& values, const SetLookupOptions& options, ExecContext* ctx) { return CallFunction("is_in", {values}, &options, ctx); diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 10a2b4bffde6d..e571553d7503b 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -1089,6 +1089,29 @@ ARROW_EXPORT Result KleeneAndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); +/// \brief In returns true for each element of `values` that is contained in +/// `value_set`. +/// In is sql-compatible, null in `values` will directly output null, +/// each elelement of `values` that isn't contained in `value_set` +/// will output null if the `value_set` contains null and output +/// false if the `value_set` doesn't contains null. +/// +/// In ignore the parameter skip_nulls in SetLookupOptions. +/// +/// \param[in] values array-like input to look up in value_set +/// \param[in] options SetLookupOptions +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.2.0.1 +/// \note API not yet finalized +ARROW_EXPORT +Result In(const Datum& values, const SetLookupOptions& options, + ExecContext* ctx = NULLPTR); +ARROW_EXPORT +Result In(const Datum& values, const Datum& value_set, + ExecContext* ctx = NULLPTR); + /// \brief IsIn returns true for each element of `values` that is contained in /// `value_set` /// diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 00d391653d240..be89ba02924bd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -380,6 +380,120 @@ Status ExecIndexIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { } // ---------------------------------------------------------------------- +// In writes the results into a preallocated boolean data bitmap +struct InVisitor { + KernelContext* ctx; + const ArraySpan& data; + ArraySpan* out; + uint8_t* out_boolean_bitmap; + uint8_t* out_null_bitmap; + + InVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out) + : ctx(ctx), + data(data), + out(out), + out_boolean_bitmap(out->buffers[1].data), + out_null_bitmap(out->buffers[0].data) {} + + Status Visit(const DataType& type) { + DCHECK_EQ(type.id(), Type::NA); + // skip_nulls is ignored in sql-compatible In + bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false); + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true); + + return Status::OK(); + } + + template + Status ProcessIsIn(const SetLookupState& state, const ArraySpan& input) { + using T = typename GetViewType::T; + FirstTimeBitmapWriter writer_boolean(out_boolean_bitmap, out->offset, out->length); + FirstTimeBitmapWriter writer_null(out_null_bitmap, out->offset, out->length); + bool value_set_has_null = state.null_index != -1; + VisitArraySpanInline( + input, + [&](T v) { + if (state.lookup_table->Get(v) != -1) { + writer_boolean.Set(); + writer_null.Clear(); + } else if (value_set_has_null) { + writer_boolean.Clear(); + writer_null.Set(); + } else { + writer_boolean.Clear(); + writer_null.Clear(); + } + writer_boolean.Next(); + writer_null.Next(); + }, + [&]() { + writer_boolean.Clear(); + writer_null.Set(); + writer_boolean.Next(); + writer_null.Next(); + }); + writer_boolean.Finish(); + writer_null.Finish(); + return Status::OK(); + } + + template + Status ProcessIsIn() { + const auto& state = checked_cast&>(*ctx->state()); + + if (!data.type->Equals(state.value_set_type)) { + auto materialized_input = data.ToArrayData(); + auto cast_result = Cast(*materialized_input, state.value_set_type, + CastOptions::Safe(), ctx->exec_context()); + if (ARROW_PREDICT_FALSE(!cast_result.ok())) { + if (cast_result.status().IsNotImplemented()) { + return Status::TypeError("Array type doesn't match type of values set: ", + *data.type, " vs ", *state.value_set_type); + } + return cast_result.status(); + } + auto casted_input = *cast_result; + return ProcessIsIn(state, *casted_input.array()); + } + return ProcessIsIn(state, data); + } + + template + enable_if_boolean Visit(const Type&) { + return ProcessIsIn(); + } + + template + enable_if_t::value && !is_boolean_type::value && + !std::is_same::value, + Status> + Visit(const Type&) { + return ProcessIsIn::Type>(); + } + + template + enable_if_base_binary Visit(const Type&) { + return ProcessIsIn(); + } + + // Handle Decimal128Type, FixedSizeBinaryType + Status Visit(const FixedSizeBinaryType& type) { + return ProcessIsIn(); + } + + Status Visit(const MonthDayNanoIntervalType& type) { + return ProcessIsIn(); + } + + Status Execute() { + const auto& state = checked_cast(*ctx->state()); + return VisitTypeInline(*state.value_set_type, this); + } +}; + +Status ExecIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return InVisitor(ctx, batch[0].array, out->array_span_mutable()).Execute(); +} // IsIn writes the results into a preallocated boolean data bitmap struct IsInVisitor { @@ -516,6 +630,28 @@ void AddBasicSetLookupKernels(ScalarKernel kernel, } } +const FunctionDoc in_doc{ + "Find each element in a set of values in sql-compatible way", + ("For each element in `values`, return true if it is found in a given\n" + "set of values. null in `values` will directly return null.\n" + "each elelement of `values` that isn't contained in the set of values\n" + "will return null if the the set of values contains null and return\n" + "false if the the set of values doesn't contains null.\n" + "The set of values to look for must be given in SetLookupOptions.\n" + "the parameter skip_nulls in SetLookupOptions is ignored in this Function"), + {"values"}, + "SetLookupOptions", + /*options_required=*/true}; + +const FunctionDoc in_meta_doc{ + "Find each element in a set of values in sql-compatible way", + ("For each element in `values`, return true if it is found in `value_set`,\n" + "null in `values` will directly return null.\n" + "each elelement of `values` that isn't contained in `value_set`\n" + "will return null if the `value_set` contains null and return\n" + "false if the `value_set` doesn't contains null."), + {"values", "value_set"}}; + const FunctionDoc is_in_doc{ "Find each element in a set of values", ("For each element in `values`, return true if it is found in a given\n" @@ -550,6 +686,20 @@ const FunctionDoc index_in_meta_doc{ "or null if it is not found there."), {"values", "value_set"}}; +class InMetaBinary : public MetaFunction { + public: + InMetaBinary() : MetaFunction("in_meta_binary", Arity::Binary(), in_meta_doc) {} + + Result ExecuteImpl(const std::vector& args, + const FunctionOptions* options, + ExecContext* ctx) const override { + if (options != nullptr) { + return Status::Invalid("Unexpected options for 'in_meta_binary' function"); + } + return In(args[0], args[1], ctx); + } +}; + // Enables calling is_in with CallFunction as though it were binary. class IsInMetaBinary : public MetaFunction { public: @@ -593,6 +743,23 @@ struct SetLookupFunction : ScalarFunction { } // namespace void RegisterScalarSetLookup(FunctionRegistry* registry) { + // In writes its boolean output into preallocated memory + { + ScalarKernel in_base; + in_base.init = InitSetLookup; + in_base.exec = ExecIn; + in_base.null_handling = NullHandling::COMPUTED_PREALLOCATE; + auto in = std::make_shared("in", Arity::Unary(), in_doc); + + AddBasicSetLookupKernels(in_base, /*output_type=*/boolean(), in.get()); + + in_base.signature = KernelSignature::Make({null()}, boolean()); + DCHECK_OK(in->AddKernel(in_base)); + DCHECK_OK(registry->AddFunction(in)); + + DCHECK_OK(registry->AddFunction(std::make_shared())); + } + // IsIn writes its boolean output into preallocated memory { ScalarKernel isin_base; From 7098b4f7e8be5db25f061c7bc59fe70bea626594 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 18 Jul 2023 10:59:04 +0800 Subject: [PATCH 02/43] Update api_scalar.cc --- cpp/src/arrow/compute/api_scalar.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 9551d052e4832..98ad9c08900e7 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -745,8 +745,7 @@ Result MinElementWise(const std::vector& args, // ---------------------------------------------------------------------- // Set-related operations -Result In(const Datum& values, const SetLookupOptions& options, - ExecContext* ctx) { +Result In(const Datum& values, const SetLookupOptions& options, ExecContext* ctx) { return CallFunction("in", {values}, &options, ctx); } From 19798a14669e129dd61be5939e51edae16f2bc4b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 18 Jul 2023 11:04:49 +0800 Subject: [PATCH 03/43] Update api_scalar.h --- cpp/src/arrow/compute/api_scalar.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index e571553d7503b..43ed8393d989b 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -1094,7 +1094,7 @@ Result KleeneAndNot(const Datum& left, const Datum& right, /// In is sql-compatible, null in `values` will directly output null, /// each elelement of `values` that isn't contained in `value_set` /// will output null if the `value_set` contains null and output -/// false if the `value_set` doesn't contains null. +/// false if the `value_set` doesn't contain null. /// /// In ignore the parameter skip_nulls in SetLookupOptions. /// @@ -1103,14 +1103,13 @@ Result KleeneAndNot(const Datum& left, const Datum& right, /// \param[in] ctx the function execution context, optional /// \return the resulting datum /// -/// \since 1.2.0.1 +/// \since 12.0.1 /// \note API not yet finalized ARROW_EXPORT Result In(const Datum& values, const SetLookupOptions& options, - ExecContext* ctx = NULLPTR); + ExecContext* ctx = NULLPTR); ARROW_EXPORT -Result In(const Datum& values, const Datum& value_set, - ExecContext* ctx = NULLPTR); +Result In(const Datum& values, const Datum& value_set, ExecContext* ctx = NULLPTR); /// \brief IsIn returns true for each element of `values` that is contained in /// `value_set` From 30ac5fe93dbe90004b5d4f4e765eb2375326e6ee Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 18 Jul 2023 11:09:03 +0800 Subject: [PATCH 04/43] Update scalar_set_lookup.cc --- cpp/src/arrow/compute/kernels/scalar_set_lookup.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index be89ba02924bd..e100fc7b2677b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -631,12 +631,12 @@ void AddBasicSetLookupKernels(ScalarKernel kernel, } const FunctionDoc in_doc{ - "Find each element in a set of values in sql-compatible way", + "Find each element in a set of values in a sql-compatible way", ("For each element in `values`, return true if it is found in a given\n" "set of values. null in `values` will directly return null.\n" "each elelement of `values` that isn't contained in the set of values\n" "will return null if the the set of values contains null and return\n" - "false if the the set of values doesn't contains null.\n" + "false if the the set of values doesn't contain null.\n" "The set of values to look for must be given in SetLookupOptions.\n" "the parameter skip_nulls in SetLookupOptions is ignored in this Function"), {"values"}, @@ -644,11 +644,11 @@ const FunctionDoc in_doc{ /*options_required=*/true}; const FunctionDoc in_meta_doc{ - "Find each element in a set of values in sql-compatible way", + "Find each element in a set of values in a sql-compatible way", ("For each element in `values`, return true if it is found in `value_set`,\n" "null in `values` will directly return null.\n" "each elelement of `values` that isn't contained in `value_set`\n" - "will return null if the `value_set` contains null and return\n" + "will return null if the `value_set` contain null and return\n" "false if the `value_set` doesn't contains null."), {"values", "value_set"}}; From 3ef86f55f44fd3a399b903880236b868afa81311 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 20 Jul 2023 00:13:21 +0800 Subject: [PATCH 05/43] fix bug --- cpp/src/arrow/compute/kernels/scalar_set_lookup.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index be89ba02924bd..06c673ef43262 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -415,20 +415,20 @@ struct InVisitor { [&](T v) { if (state.lookup_table->Get(v) != -1) { writer_boolean.Set(); - writer_null.Clear(); + writer_null.Set(); } else if (value_set_has_null) { writer_boolean.Clear(); - writer_null.Set(); + writer_null.Clear(); } else { writer_boolean.Clear(); - writer_null.Clear(); + writer_null.Set(); } writer_boolean.Next(); writer_null.Next(); }, [&]() { writer_boolean.Clear(); - writer_null.Set(); + writer_null.Clear(); writer_boolean.Next(); writer_null.Next(); }); From 60bbb1468adb75bd13d3800f1f3735608146afc6 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 20 Jul 2023 00:17:15 +0800 Subject: [PATCH 06/43] fix bug2 --- cpp/src/arrow/compute/kernels/scalar_set_lookup.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 3d303e08aa7b6..f92542ed16da7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -399,7 +399,7 @@ struct InVisitor { DCHECK_EQ(type.id(), Type::NA); // skip_nulls is ignored in sql-compatible In bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false); - bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true); + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, false); return Status::OK(); } From b79a4dc466d18e452224e8379e024c51fc9fad9c Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 18 Aug 2023 10:24:30 +0800 Subject: [PATCH 07/43] isin init --- cpp/src/arrow/compute/api_scalar.cc | 36 ++- cpp/src/arrow/compute/api_scalar.h | 37 +-- .../compute/kernels/scalar_set_lookup.cc | 214 ++++-------------- 3 files changed, 80 insertions(+), 207 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 98ad9c08900e7..dc98201ee5c38 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -344,7 +344,8 @@ static auto kRoundToMultipleOptionsType = GetFunctionOptionsType( DataMember("value_set", &SetLookupOptions::value_set), - DataMember("skip_nulls", &SetLookupOptions::skip_nulls)); + DataMember("skip_nulls", &SetLookupOptions::skip_nulls), + DataMember("null_matching_behavior", &SetLookupOptions::null_matching_behavior)); static auto kSliceOptionsType = GetFunctionOptionsType( DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop), DataMember("step", &SliceOptions::step)); @@ -540,8 +541,29 @@ constexpr char RoundToMultipleOptions::kTypeName[]; SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) : FunctionOptions(internal::kSetLookupOptionsType), value_set(std::move(value_set)), - skip_nulls(skip_nulls) {} -SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {} + skip_nulls(skip_nulls) { + if (skip_nulls) { + this->null_matching_behavior = SetLookupOptions::NullMatchingBehavior::SKIP; + } else { + this->null_matching_behavior = SetLookupOptions::NullMatchingBehavior::MATCH; + } +} +SetLookupOptions::SetLookupOptions( + Datum value_set, SetLookupOptions::NullMatchingBehavior null_matching_behavior) + : FunctionOptions(internal::kSetLookupOptionsType), + value_set(std::move(value_set)), + null_matching_behavior(std::move(null_matching_behavior)) {} +SetLookupOptions::SetLookupOptions() + : SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {} +SetLookupOptions::NullMatchingBehavior SetLookupOptions::getNullMatchingBehavior() { + if (this->skip_nulls == std::nullopt) { + return this->null_matching_behavior; + } else if (this->skip_nulls) { + return SetLookupOptions::NullMatchingBehavior::SKIP; + } else { + return SetLookupOptions::NullMatchingBehavior::MATCH; + } +} constexpr char SetLookupOptions::kTypeName[]; SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step) @@ -745,14 +767,6 @@ Result MinElementWise(const std::vector& args, // ---------------------------------------------------------------------- // Set-related operations -Result In(const Datum& values, const SetLookupOptions& options, ExecContext* ctx) { - return CallFunction("in", {values}, &options, ctx); -} - -Result In(const Datum& values, const Datum& value_set, ExecContext* ctx) { - return In(values, SetLookupOptions{value_set}, ctx); -} - Result IsIn(const Datum& values, const SetLookupOptions& options, ExecContext* ctx) { return CallFunction("is_in", {values}, &options, ctx); diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 43ed8393d989b..2bffb3542e86e 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -268,19 +268,28 @@ class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions { /// Options for IsIn and IndexIn functions class ARROW_EXPORT SetLookupOptions : public FunctionOptions { public: - explicit SetLookupOptions(Datum value_set, bool skip_nulls = false); - SetLookupOptions(); + enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE }; + + explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); + explicit SetLookupOptions(Datum value_set, bool skip_nulls); SetLookupOptions(); static constexpr char const kTypeName[] = "SetLookupOptions"; /// The set of values to look up input values into. Datum value_set; + + NullMatchingBehavior null_matching_behavior; + + // DEPRECATED(will be removed after removing of skip_nulls) + NullMatchingBehavior const getNullMatchingBehavior(); + + // DEPRECATED(use null_matching_behavior instead) /// Whether nulls in `value_set` count for lookup. /// /// If true, any null in `value_set` is ignored and nulls in the input /// produce null (IndexIn) or false (IsIn) values in the output. /// If false, any null in `value_set` is successfully matched in /// the input. - bool skip_nulls; + std::optional skip_nulls; }; /// Options for struct_field function @@ -1089,28 +1098,6 @@ ARROW_EXPORT Result KleeneAndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); -/// \brief In returns true for each element of `values` that is contained in -/// `value_set`. -/// In is sql-compatible, null in `values` will directly output null, -/// each elelement of `values` that isn't contained in `value_set` -/// will output null if the `value_set` contains null and output -/// false if the `value_set` doesn't contain null. -/// -/// In ignore the parameter skip_nulls in SetLookupOptions. -/// -/// \param[in] values array-like input to look up in value_set -/// \param[in] options SetLookupOptions -/// \param[in] ctx the function execution context, optional -/// \return the resulting datum -/// -/// \since 12.0.1 -/// \note API not yet finalized -ARROW_EXPORT -Result In(const Datum& values, const SetLookupOptions& options, - ExecContext* ctx = NULLPTR); -ARROW_EXPORT -Result In(const Datum& values, const Datum& value_set, ExecContext* ctx = NULLPTR); - /// \brief IsIn returns true for each element of `values` that is contained in /// `value_set` /// diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index f92542ed16da7..fc2d1319e1f4d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -44,6 +44,7 @@ struct SetLookupState : public SetLookupStateBase { explicit SetLookupState(MemoryPool* pool) : memory_pool(pool) {} Status Init(const SetLookupOptions& options) { + this->null_matching_behavior = options.getNullMatchingBehavior(); if (options.value_set.is_array()) { const ArrayData& value_set = *options.value_set.array(); memo_index_to_value_index.reserve(value_set.length); @@ -66,7 +67,8 @@ struct SetLookupState : public SetLookupStateBase { } else { return Status::Invalid("value_set should be an array or chunked array"); } - if (!options.skip_nulls && lookup_table->GetNull() >= 0) { + if (this->null_matching_behavior != SetLookupOptions::SKIP && + lookup_table->GetNull() >= 0) { null_index = memo_index_to_value_index[lookup_table->GetNull()]; } value_set_type = options.value_set.type(); @@ -117,19 +119,23 @@ struct SetLookupState : public SetLookupStateBase { // be mapped back to indices in the value_set. std::vector memo_index_to_value_index; int32_t null_index = -1; + SetLookupOptions::NullMatchingBehavior null_matching_behavior; }; template <> struct SetLookupState : public SetLookupStateBase { explicit SetLookupState(MemoryPool*) {} - Status Init(const SetLookupOptions& options) { - value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls; + Status Init(SetLookupOptions& options) { + null_matching_behavior = options.getNullMatchingBehavior(); + value_set_has_null = (options.value_set.length() > 0) && + this->null_matching_behavior != SetLookupOptions::SKIP; value_set_type = null(); return Status::OK(); } bool value_set_has_null; + SetLookupOptions::NullMatchingBehavior null_matching_behavior; }; // TODO: Put this concept somewhere reusable @@ -379,16 +385,15 @@ Status ExecIndexIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return IndexInVisitor(ctx, batch[0].array, out->array_span_mutable()).Execute(); } -// ---------------------------------------------------------------------- -// In writes the results into a preallocated boolean data bitmap -struct InVisitor { +// IsIn writes the results into a preallocated boolean data bitmap +struct IsInVisitor { KernelContext* ctx; const ArraySpan& data; ArraySpan* out; uint8_t* out_boolean_bitmap; uint8_t* out_null_bitmap; - InVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out) + IsInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out) : ctx(ctx), data(data), out(out), @@ -397,10 +402,20 @@ struct InVisitor { Status Visit(const DataType& type) { DCHECK_EQ(type.id(), Type::NA); - // skip_nulls is ignored in sql-compatible In - bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false); - bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, false); + const auto& state = checked_cast&>(*ctx->state()); + if (state.null_matching_behavior == SetLookupOptions::MATCH && + state.value_set_has_null) { + bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, true); + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true); + } else if (state.null_matching_behavior == SetLookupOptions::SKIP || + (!state.value_set_has_null && + state.null_matching_behavior == SetLookupOptions::MATCH)) { + bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false); + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true); + } else { + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, false); + } return Status::OK(); } @@ -413,13 +428,14 @@ struct InVisitor { VisitArraySpanInline( input, [&](T v) { - if (state.lookup_table->Get(v) != -1) { + if (state.lookup_table->Get(v) != -1) { // true writer_boolean.Set(); writer_null.Set(); - } else if (value_set_has_null) { + } else if (state.null_matching_behavior == SetLookupOptions::INCONCLUSIVE && + value_set_has_null) { // null writer_boolean.Clear(); writer_null.Clear(); - } else { + } else { // false writer_boolean.Clear(); writer_null.Set(); } @@ -427,8 +443,19 @@ struct InVisitor { writer_null.Next(); }, [&]() { - writer_boolean.Clear(); - writer_null.Clear(); + if (state.null_matching_behavior == SetLookupOptions::MATCH && + value_set_has_null) { // true + writer_boolean.Set(); + writer_null.Set(); + } else if (state.null_matching_behavior == SetLookupOptions::SKIP || + (!value_set_has_null && state.null_matching_behavior == + SetLookupOptions::MATCH)) { // false + writer_boolean.Clear(); + writer_null.Set(); + } else { // null + writer_boolean.Clear(); + writer_null.Clear(); + } writer_boolean.Next(); writer_null.Next(); }); @@ -491,108 +518,6 @@ struct InVisitor { } }; -Status ExecIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return InVisitor(ctx, batch[0].array, out->array_span_mutable()).Execute(); -} - -// IsIn writes the results into a preallocated boolean data bitmap -struct IsInVisitor { - KernelContext* ctx; - const ArraySpan& data; - ArraySpan* out; - - IsInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out) - : ctx(ctx), data(data), out(out) {} - - Status Visit(const DataType& type) { - DCHECK_EQ(type.id(), Type::NA); - const auto& state = checked_cast&>(*ctx->state()); - // skip_nulls is honored for consistency with other types - bit_util::SetBitsTo(out->buffers[1].data, out->offset, out->length, - state.value_set_has_null); - return Status::OK(); - } - - template - Status ProcessIsIn(const SetLookupState& state, const ArraySpan& input) { - using T = typename GetViewType::T; - FirstTimeBitmapWriter writer(out->buffers[1].data, out->offset, out->length); - VisitArraySpanInline( - input, - [&](T v) { - if (state.lookup_table->Get(v) != -1) { - writer.Set(); - } else { - writer.Clear(); - } - writer.Next(); - }, - [&]() { - if (state.null_index != -1) { - writer.Set(); - } else { - writer.Clear(); - } - writer.Next(); - }); - writer.Finish(); - return Status::OK(); - } - - template - Status ProcessIsIn() { - const auto& state = checked_cast&>(*ctx->state()); - - if (!data.type->Equals(state.value_set_type)) { - auto materialized_input = data.ToArrayData(); - auto cast_result = Cast(*materialized_input, state.value_set_type, - CastOptions::Safe(), ctx->exec_context()); - if (ARROW_PREDICT_FALSE(!cast_result.ok())) { - if (cast_result.status().IsNotImplemented()) { - return Status::TypeError("Array type doesn't match type of values set: ", - *data.type, " vs ", *state.value_set_type); - } - return cast_result.status(); - } - auto casted_input = *cast_result; - return ProcessIsIn(state, *casted_input.array()); - } - return ProcessIsIn(state, data); - } - - template - enable_if_boolean Visit(const Type&) { - return ProcessIsIn(); - } - - template - enable_if_t::value && !is_boolean_type::value && - !std::is_same::value, - Status> - Visit(const Type&) { - return ProcessIsIn::Type>(); - } - - template - enable_if_base_binary Visit(const Type&) { - return ProcessIsIn(); - } - - // Handle Decimal128Type, FixedSizeBinaryType - Status Visit(const FixedSizeBinaryType& type) { - return ProcessIsIn(); - } - - Status Visit(const MonthDayNanoIntervalType& type) { - return ProcessIsIn(); - } - - Status Execute() { - const auto& state = checked_cast(*ctx->state()); - return VisitTypeInline(*state.value_set_type, this); - } -}; - Status ExecIsIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return IsInVisitor(ctx, batch[0].array, out->array_span_mutable()).Execute(); } @@ -630,28 +555,6 @@ void AddBasicSetLookupKernels(ScalarKernel kernel, } } -const FunctionDoc in_doc{ - "Find each element in a set of values in a sql-compatible way", - ("For each element in `values`, return true if it is found in a given\n" - "set of values. null in `values` will directly return null.\n" - "each elelement of `values` that isn't contained in the set of values\n" - "will return null if the the set of values contains null and return\n" - "false if the the set of values doesn't contain null.\n" - "The set of values to look for must be given in SetLookupOptions.\n" - "the parameter skip_nulls in SetLookupOptions is ignored in this Function"), - {"values"}, - "SetLookupOptions", - /*options_required=*/true}; - -const FunctionDoc in_meta_doc{ - "Find each element in a set of values in a sql-compatible way", - ("For each element in `values`, return true if it is found in `value_set`,\n" - "null in `values` will directly return null.\n" - "each elelement of `values` that isn't contained in `value_set`\n" - "will return null if the `value_set` contain null and return\n" - "false if the `value_set` doesn't contains null."), - {"values", "value_set"}}; - const FunctionDoc is_in_doc{ "Find each element in a set of values", ("For each element in `values`, return true if it is found in a given\n" @@ -686,20 +589,6 @@ const FunctionDoc index_in_meta_doc{ "or null if it is not found there."), {"values", "value_set"}}; -class InMetaBinary : public MetaFunction { - public: - InMetaBinary() : MetaFunction("in_meta_binary", Arity::Binary(), in_meta_doc) {} - - Result ExecuteImpl(const std::vector& args, - const FunctionOptions* options, - ExecContext* ctx) const override { - if (options != nullptr) { - return Status::Invalid("Unexpected options for 'in_meta_binary' function"); - } - return In(args[0], args[1], ctx); - } -}; - // Enables calling is_in with CallFunction as though it were binary. class IsInMetaBinary : public MetaFunction { public: @@ -743,29 +632,12 @@ struct SetLookupFunction : ScalarFunction { } // namespace void RegisterScalarSetLookup(FunctionRegistry* registry) { - // In writes its boolean output into preallocated memory - { - ScalarKernel in_base; - in_base.init = InitSetLookup; - in_base.exec = ExecIn; - in_base.null_handling = NullHandling::COMPUTED_PREALLOCATE; - auto in = std::make_shared("in", Arity::Unary(), in_doc); - - AddBasicSetLookupKernels(in_base, /*output_type=*/boolean(), in.get()); - - in_base.signature = KernelSignature::Make({null()}, boolean()); - DCHECK_OK(in->AddKernel(in_base)); - DCHECK_OK(registry->AddFunction(in)); - - DCHECK_OK(registry->AddFunction(std::make_shared())); - } - // IsIn writes its boolean output into preallocated memory { ScalarKernel isin_base; isin_base.init = InitSetLookup; isin_base.exec = ExecIsIn; - isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL; + isin_base.null_handling = NullHandling::COMPUTED_PREALLOCATE; auto is_in = std::make_shared("is_in", Arity::Unary(), is_in_doc); AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get()); From 5be997c235fc42399ea90e1f19979a36230f8693 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Fri, 18 Aug 2023 22:13:07 +0800 Subject: [PATCH 08/43] ARROW-36420 --- cpp/src/arrow/compute/api_scalar.cc | 29 +++++++++++++++++-- cpp/src/arrow/compute/api_scalar.h | 9 +++--- .../compute/kernels/scalar_set_lookup.cc | 9 ++++-- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index dc98201ee5c38..f350e5ddac4c0 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -275,6 +275,29 @@ struct EnumTraits } }; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "SetLookupOptions::NullMatchingBehavior"; } + static std::string value_name(compute::SetLookupOptions::NullMatchingBehavior value) { + switch (value) { + case compute::SetLookupOptions::NullMatchingBehavior::MATCH: + return "MATCH"; + case compute::SetLookupOptions::NullMatchingBehavior::SKIP: + return "SKIP"; + case compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL: + return "EMIT_NULL"; + case compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE: + return "INCONCLUSIVE"; + } + return ""; + } +}; + } // namespace internal namespace compute { @@ -555,13 +578,13 @@ SetLookupOptions::SetLookupOptions( null_matching_behavior(std::move(null_matching_behavior)) {} SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {} -SetLookupOptions::NullMatchingBehavior SetLookupOptions::getNullMatchingBehavior() { +SetLookupOptions::NullMatchingBehavior SetLookupOptions::getNullMatchingBehavior() const { if (this->skip_nulls == std::nullopt) { return this->null_matching_behavior; } else if (this->skip_nulls) { - return SetLookupOptions::NullMatchingBehavior::SKIP; + return SetLookupOptions::SKIP; } else { - return SetLookupOptions::NullMatchingBehavior::MATCH; + return SetLookupOptions::MATCH; } } constexpr char SetLookupOptions::kTypeName[]; diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 2bffb3542e86e..4b5810841f2bf 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -271,17 +271,18 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE }; explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); - explicit SetLookupOptions(Datum value_set, bool skip_nulls); SetLookupOptions(); + explicit SetLookupOptions(Datum value_set, bool skip_nulls); + SetLookupOptions(); static constexpr char const kTypeName[] = "SetLookupOptions"; /// The set of values to look up input values into. Datum value_set; NullMatchingBehavior null_matching_behavior; - + // DEPRECATED(will be removed after removing of skip_nulls) - NullMatchingBehavior const getNullMatchingBehavior(); - + NullMatchingBehavior getNullMatchingBehavior() const; + // DEPRECATED(use null_matching_behavior instead) /// Whether nulls in `value_set` count for lookup. /// diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index fc2d1319e1f4d..d5cf258a267f5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -280,10 +280,12 @@ struct IndexInVisitor { const auto& state = checked_cast&>(*ctx->state()); if (data.length != 0) { - // skip_nulls is honored for consistency with other types - bit_util::SetBitsTo(out_bitmap, out->offset, out->length, state.value_set_has_null); + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, + state.null_matching_behavior == SetLookupOptions::MATCH && + state.value_set_has_null); // Set all values to 0, which will be unmasked only if null is in the value_set + // and null_matching_behavior is equal to MATCH std::memset(out->GetValues(1), 0x00, out->length * sizeof(int32_t)); } return Status::OK(); @@ -311,7 +313,8 @@ struct IndexInVisitor { bitmap_writer.Next(); }, [&]() { - if (state.null_index != -1) { + if (state.null_index != -1 && + state.null_matching_behavior == SetLookupOptions::MATCH) { bitmap_writer.Set(); // value_set included null From 52e1dfbffcf72ec44b64ab84d182346444eebe75 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 20 Aug 2023 18:25:13 +0800 Subject: [PATCH 09/43] fix bug --- cpp/src/arrow/compute/api_scalar.cc | 36 +++--------------------- cpp/src/arrow/compute/expression_test.cc | 2 +- 2 files changed, 5 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f350e5ddac4c0..7b160b9f97a9a 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -566,9 +566,9 @@ SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) value_set(std::move(value_set)), skip_nulls(skip_nulls) { if (skip_nulls) { - this->null_matching_behavior = SetLookupOptions::NullMatchingBehavior::SKIP; + this->null_matching_behavior = SetLookupOptions::SKIP; } else { - this->null_matching_behavior = SetLookupOptions::NullMatchingBehavior::MATCH; + this->null_matching_behavior = SetLookupOptions::MATCH; } } SetLookupOptions::SetLookupOptions( @@ -579,9 +579,9 @@ SetLookupOptions::SetLookupOptions( SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {} SetLookupOptions::NullMatchingBehavior SetLookupOptions::getNullMatchingBehavior() const { - if (this->skip_nulls == std::nullopt) { + if (!this->skip_nulls.has_value()) { return this->null_matching_behavior; - } else if (this->skip_nulls) { + } else if (this->skip_nulls.value()) { return SetLookupOptions::SKIP; } else { return SetLookupOptions::MATCH; @@ -820,34 +820,6 @@ SCALAR_EAGER_BINARY(Or, "or") SCALAR_EAGER_BINARY(Xor, "xor") SCALAR_EAGER_UNARY(Invert, "invert") -// ---------------------------------------------------------------------- - -Result Compare(const Datum& left, const Datum& right, CompareOptions options, - ExecContext* ctx) { - std::string func_name; - switch (options.op) { - case CompareOperator::EQUAL: - func_name = "equal"; - break; - case CompareOperator::NOT_EQUAL: - func_name = "not_equal"; - break; - case CompareOperator::GREATER: - func_name = "greater"; - break; - case CompareOperator::GREATER_EQUAL: - func_name = "greater_equal"; - break; - case CompareOperator::LESS: - func_name = "less"; - break; - case CompareOperator::LESS_EQUAL: - func_name = "less_equal"; - break; - } - return CallFunction(func_name, {left, right}, nullptr, ctx); -} - // ---------------------------------------------------------------------- // Validity functions diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index f90e01a2f81bc..898c511afa1f7 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -263,7 +263,7 @@ TEST(Expression, ToString) { compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}); EXPECT_EQ(in_12.ToString(), - "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=false})"); + "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=nullopt, null_matching_behavior=MATCH})"); EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)"); EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)"); From f62c50f46c6e12a0bf4c4911545d52a5b7b64ce8 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 20 Aug 2023 18:27:36 +0800 Subject: [PATCH 10/43] roll back --- cpp/src/arrow/compute/api_scalar.cc | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 7b160b9f97a9a..74660522a8f66 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -820,6 +820,34 @@ SCALAR_EAGER_BINARY(Or, "or") SCALAR_EAGER_BINARY(Xor, "xor") SCALAR_EAGER_UNARY(Invert, "invert") +// ---------------------------------------------------------------------- + +Result Compare(const Datum& left, const Datum& right, CompareOptions options, + ExecContext* ctx) { + std::string func_name; + switch (options.op) { + case CompareOperator::EQUAL: + func_name = "equal"; + break; + case CompareOperator::NOT_EQUAL: + func_name = "not_equal"; + break; + case CompareOperator::GREATER: + func_name = "greater"; + break; + case CompareOperator::GREATER_EQUAL: + func_name = "greater_equal"; + break; + case CompareOperator::LESS: + func_name = "less"; + break; + case CompareOperator::LESS_EQUAL: + func_name = "less_equal"; + break; + } + return CallFunction(func_name, {left, right}, nullptr, ctx); +} + // ---------------------------------------------------------------------- // Validity functions From 712454f12ca9e4b4d73040174bb566f199635922 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 20 Aug 2023 18:38:00 +0800 Subject: [PATCH 11/43] lint --- cpp/src/arrow/compute/api_scalar.cc | 2 +- cpp/src/arrow/compute/expression_test.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 74660522a8f66..6a3df26e54825 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -281,7 +281,7 @@ struct EnumTraits compute::SetLookupOptions::NullMatchingBehavior::MATCH, compute::SetLookupOptions::NullMatchingBehavior::SKIP, compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL, - compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE> { + compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE> { static std::string name() { return "SetLookupOptions::NullMatchingBehavior"; } static std::string value_name(compute::SetLookupOptions::NullMatchingBehavior value) { switch (value) { diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index 898c511afa1f7..f9e4a9689a530 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -263,7 +263,8 @@ TEST(Expression, ToString) { compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}); EXPECT_EQ(in_12.ToString(), - "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=nullopt, null_matching_behavior=MATCH})"); + "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=nullopt, " + "null_matching_behavior=MATCH})"); EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)"); EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)"); From 0ea3bdf9c107cf82a226a09305006482bcdce5da Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 20 Aug 2023 21:07:29 +0800 Subject: [PATCH 12/43] doc test --- python/pyarrow/_compute.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index ac7efeff41aba..d4e5a0db80c6f 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2325,7 +2325,7 @@ cdef class Expression(_Weakrefable): 1, 2, 3 - ], skip_nulls=false})> + ], skip_nulls=0, null_matching_behavior=MATCH})> """ def __init__(self): From 5d7602bac8f605251188c53fd507af70ec5acb8e Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sun, 20 Aug 2023 23:18:25 +0800 Subject: [PATCH 13/43] set g_lib --- c_glib/arrow-glib/compute.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 7fe005f94a5bb..090c89e1dc396 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -3346,7 +3346,11 @@ garrow_set_lookup_options_get_property(GObject *object, g_value_set_object(value, priv->value_set); break; case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: - g_value_set_boolean(value, options->skip_nulls); + if (!options->skip_nulls.has_value() || !options->skip_nulls.value()) { + g_value_set_boolean(value, false); + } else { + g_value_set_boolean(value, true); + } break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); From f6c41599ee13ee4e93a9248937cb3c2e378a316d Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 21 Aug 2023 00:13:37 +0800 Subject: [PATCH 14/43] glib2 --- c_glib/arrow-glib/compute.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 090c89e1dc396..3361404d2fc1b 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -3402,13 +3402,14 @@ garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass) * * Since: 6.0.0 */ - spec = g_param_spec_boolean("skip-nulls", - "Skip NULLs", - "Whether NULLs are skipped or not", - options.skip_nulls, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, - PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, + bool skip_nulls = false; + if (options.skip_nulls.has_value() && options.skip_nulls.value()) { + skip_nulls = true; + } + spec = + g_param_spec_boolean("skip-nulls", "Skip NULLs", "Whether NULLs are skipped or not", + skip_nulls, static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, spec); } @@ -6462,9 +6463,13 @@ garrow_set_lookup_options_new_raw( arrow_copied_options.get()); auto value_set = garrow_datum_new_raw(&(arrow_copied_set_lookup_options->value_set)); + bool skip_nulls = false; + if (arrow_options->skip_nulls.has_value() && arrow_options->skip_nulls.value()) { + skip_nulls = true; + } auto options = g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS, "value-set", value_set, - "skip-nulls", arrow_options->skip_nulls, + "skip-nulls", skip_nulls, NULL); return GARROW_SET_LOOKUP_OPTIONS(options); } From b61ba64ec33bb4bff83316607e6accffd85e9e27 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 21 Aug 2023 20:56:17 +0800 Subject: [PATCH 15/43] add a test --- .../compute/kernels/scalar_set_lookup_test.cc | 138 +++++++++++++++++- 1 file changed, 133 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index d1645eb8d9a49..2d22646ad1496 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -50,7 +50,67 @@ namespace compute { void CheckIsIn(const std::shared_ptr input, const std::shared_ptr& value_set, const std::string& expected_json, - bool skip_nulls = false) { + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto expected = ArrayFromJSON(boolean(), expected_json); + + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, null_matching_behavior))); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); +} + +void CheckIsIn(const std::shared_ptr& type, const std::string& input_json, + const std::string& value_set_json, const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto input = ArrayFromJSON(type, input_json); + auto value_set = ArrayFromJSON(type, value_set_json); + CheckIsIn(input, value_set, expected_json, null_matching_behavior); +} + +void CheckIsInChunked(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::shared_ptr& expected, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, null_matching_behavior))); + auto actual = actual_datum.chunked_array(); + ValidateOutput(actual_datum); + + // Output contiguous in a single chunk + ASSERT_EQ(1, actual->num_chunks()); + ASSERT_TRUE(actual->Equals(*expected)); +} + +void CheckIsInDictionary(const std::shared_ptr& type, + const std::shared_ptr& index_type, + const std::string& input_dictionary_json, + const std::string& input_index_json, + const std::string& value_set_json, + const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto dict_type = dictionary(index_type, type); + auto indices = ArrayFromJSON(index_type, input_index_json); + auto dict = ArrayFromJSON(type, input_dictionary_json); + + ASSERT_OK_AND_ASSIGN(auto input, DictionaryArray::FromArrays(dict_type, indices, dict)); + auto value_set = ArrayFromJSON(type, value_set_json); + auto expected = ArrayFromJSON(boolean(), expected_json); + + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, null_matching_behavior))); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); +} + +void CheckIsIn(const std::shared_ptr input, + const std::shared_ptr& value_set, const std::string& expected_json, + bool skip_nulls) { auto expected = ArrayFromJSON(boolean(), expected_json); ASSERT_OK_AND_ASSIGN(Datum actual_datum, @@ -62,7 +122,7 @@ void CheckIsIn(const std::shared_ptr input, void CheckIsIn(const std::shared_ptr& type, const std::string& input_json, const std::string& value_set_json, const std::string& expected_json, - bool skip_nulls = false) { + bool skip_nulls) { auto input = ArrayFromJSON(type, input_json); auto value_set = ArrayFromJSON(type, value_set_json); CheckIsIn(input, value_set, expected_json, skip_nulls); @@ -71,7 +131,7 @@ void CheckIsIn(const std::shared_ptr& type, const std::string& input_j void CheckIsInChunked(const std::shared_ptr& input, const std::shared_ptr& value_set, const std::shared_ptr& expected, - bool skip_nulls = false) { + bool skip_nulls) { ASSERT_OK_AND_ASSIGN(Datum actual_datum, IsIn(input, SetLookupOptions(value_set, skip_nulls))); auto actual = actual_datum.chunked_array(); @@ -87,7 +147,7 @@ void CheckIsInDictionary(const std::shared_ptr& type, const std::string& input_dictionary_json, const std::string& input_index_json, const std::string& value_set_json, - const std::string& expected_json, bool skip_nulls = false) { + const std::string& expected_json, bool skip_nulls) { auto dict_type = dictionary(index_type, type); auto indices = ArrayFromJSON(index_type, input_index_json); auto dict = ArrayFromJSON(type, input_dictionary_json); @@ -185,18 +245,30 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Nulls in right array CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Nulls in both the arrays CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[true, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", @@ -204,6 +276,12 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[null, true, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Empty Arrays CheckIsIn(type, "[]", "[]", "[]"); @@ -217,7 +295,15 @@ TEST_F(TestIsInKernel, NullType) { CheckIsIn(type, "[]", "[]", "[]"); CheckIsIn(type, "[null, null]", "[null]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, null]", "[null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, "[null, null]", "[]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, null]", "[]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[null, null, null]", "[null, null]", "[true, true, true]"); @@ -232,12 +318,24 @@ TEST_F(TestIsInKernel, TimeTimestamp) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, false, false, true, true]", /*skip_nulls=*/true); - + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + // Duplicates in right array CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } // Disallow mixing timezone-aware and timezone-naive values @@ -260,12 +358,24 @@ TEST_F(TestIsInKernel, TimeDuration) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } // Different units, cast value_set to values will fail, then cast values to value_set @@ -285,17 +395,35 @@ TEST_F(TestIsInKernel, Boolean) { "[false, true, false, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[false]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", "[false, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[null, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", "[false, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[null, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } TYPED_TEST_SUITE(TestIsInKernelBinary, BaseBinaryArrowTypes); From d409b7e3fff38bd23c617015867b38bb7c3d61cd Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 21 Aug 2023 21:16:22 +0800 Subject: [PATCH 16/43] add a test2 --- .../compute/kernels/scalar_set_lookup_test.cc | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 2d22646ad1496..76dc686226cd4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -299,6 +299,7 @@ TEST_F(TestIsInKernel, NullType) { /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); CheckIsIn(type, "[null, null]", "[null]", "[null, null]", /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + CheckIsIn(type, "[null, null]", "[]", "[false, false]", /*skip_nulls=*/true); CheckIsIn(type, "[null, null]", "[]", "[null, null]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -308,6 +309,10 @@ TEST_F(TestIsInKernel, NullType) { // Duplicates in right array CheckIsIn(type, "[null, null, null]", "[null, null]", "[true, true, true]"); CheckIsIn(type, "[null, null]", "[null, null]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[null, null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, null]", "[null, null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } TEST_F(TestIsInKernel, TimeTimestamp) { @@ -437,6 +442,12 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", "[true, true, false, true, true]", @@ -444,6 +455,12 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", @@ -452,6 +469,12 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } TEST_F(TestIsInKernel, FixedSizeBinary) { @@ -463,6 +486,12 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", "[true, true, false, true, true]", @@ -470,6 +499,12 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", @@ -480,6 +515,14 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { R"(["aaa", null, "aaa", "bbb", "bbb", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); ASSERT_RAISES(Invalid, IsIn(ArrayFromJSON(fixed_size_binary(3), R"(["abc"])"), @@ -494,6 +537,12 @@ TEST_F(TestIsInKernel, Decimal) { CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9", null])", "[true, false, true, true, true]", @@ -501,6 +550,12 @@ TEST_F(TestIsInKernel, Decimal) { CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9", null])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, null, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", @@ -511,6 +566,14 @@ TEST_F(TestIsInKernel, Decimal) { R"([null, "12.3", "12.3", "78.9", "78.9", null])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, null, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(ArrayFromJSON(decimal128(4, 2), R"(["12.30", "45.60", "78.90"])"), ArrayFromJSON(type, R"(["12.3", "78.9"])"), "[true, false, true]"); From a0a511d52f419a31ecc6e6086b6b356afe4df01b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 21 Aug 2023 21:33:25 +0800 Subject: [PATCH 17/43] add a test 3 --- .../compute/kernels/scalar_set_lookup_test.cc | 93 ++++++++++++++++++- 1 file changed, 90 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 76dc686226cd4..3f96299c8292c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -678,6 +678,14 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + expected = ChunkedArrayFromJSON( + boolean(), {"[true, true, true, true, false]", "[true, null, true, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + expected = ChunkedArrayFromJSON( + boolean(), {"[true, true, true, true, false]", "[true, null, true, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); value_set = ChunkedArrayFromJSON(utf8(), {R"(["", "def"])", R"([null])"}); expected = ChunkedArrayFromJSON( @@ -686,6 +694,14 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + expected = ChunkedArrayFromJSON( + boolean(), {"[false, true, true, false, false]", "[true, null, false, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + expected = ChunkedArrayFromJSON( + boolean(), {"[null, true, true, null, null]", "[true, null, null, null]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in value_set value_set = @@ -696,6 +712,14 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + expected = ChunkedArrayFromJSON( + boolean(), {"[false, true, true, false, false]", "[true, null, false, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + expected = ChunkedArrayFromJSON( + boolean(), {"[null, true, true, null, null]", "[true, null, null, null]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } // ---------------------------------------------------------------------- @@ -705,7 +729,70 @@ class TestIndexInKernel : public ::testing::Test { public: void CheckIndexIn(const std::shared_ptr& input, const std::shared_ptr& value_set, - const std::string& expected_json, bool skip_nulls = false) { + const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + std::shared_ptr expected = ArrayFromJSON(int32(), expected_json); + + SetLookupOptions options(value_set, null_matching_behavior); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, options)); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); + } + + void CheckIndexIn(const std::shared_ptr& type, const std::string& input_json, + const std::string& value_set_json, const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + std::shared_ptr input = ArrayFromJSON(type, input_json); + std::shared_ptr value_set = ArrayFromJSON(type, value_set_json); + return CheckIndexIn(input, value_set, expected_json, null_matching_behavior); + } + + void CheckIndexInChunked(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::shared_ptr& expected, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + ASSERT_OK_AND_ASSIGN( + Datum actual, + IndexIn(input, SetLookupOptions(value_set, null_matching_behavior))); + ASSERT_EQ(Datum::CHUNKED_ARRAY, actual.kind()); + ValidateOutput(actual); + + auto actual_chunked = actual.chunked_array(); + + // Output contiguous in a single chunk + ASSERT_EQ(1, actual_chunked->num_chunks()); + ASSERT_TRUE(actual_chunked->Equals(*expected)); + } + + void CheckIndexInDictionary( + const std::shared_ptr& type, const std::shared_ptr& index_type, + const std::string& input_dictionary_json, const std::string& input_index_json, + const std::string& value_set_json, const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto dict_type = dictionary(index_type, type); + auto indices = ArrayFromJSON(index_type, input_index_json); + auto dict = ArrayFromJSON(type, input_dictionary_json); + + ASSERT_OK_AND_ASSIGN(auto input, + DictionaryArray::FromArrays(dict_type, indices, dict)); + auto value_set = ArrayFromJSON(type, value_set_json); + auto expected = ArrayFromJSON(int32(), expected_json); + + SetLookupOptions options(value_set, null_matching_behavior); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, options)); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); + } + + void CheckIndexIn(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::string& expected_json, bool skip_nulls) { std::shared_ptr expected = ArrayFromJSON(int32(), expected_json); SetLookupOptions options(value_set, skip_nulls); @@ -717,7 +804,7 @@ class TestIndexInKernel : public ::testing::Test { void CheckIndexIn(const std::shared_ptr& type, const std::string& input_json, const std::string& value_set_json, const std::string& expected_json, - bool skip_nulls = false) { + bool skip_nulls) { std::shared_ptr input = ArrayFromJSON(type, input_json); std::shared_ptr value_set = ArrayFromJSON(type, value_set_json); return CheckIndexIn(input, value_set, expected_json, skip_nulls); @@ -744,7 +831,7 @@ class TestIndexInKernel : public ::testing::Test { const std::string& input_dictionary_json, const std::string& input_index_json, const std::string& value_set_json, - const std::string& expected_json, bool skip_nulls = false) { + const std::string& expected_json, bool skip_nulls) { auto dict_type = dictionary(index_type, type); auto indices = ArrayFromJSON(index_type, input_index_json); auto dict = ArrayFromJSON(type, input_dictionary_json); From aa9eb0110eaf90bc04b367daba58925e75097c68 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 21 Aug 2023 22:00:33 +0800 Subject: [PATCH 18/43] add a test4 --- .../compute/kernels/scalar_set_lookup_test.cc | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 3f96299c8292c..4ffd57a0a9c08 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -643,6 +643,52 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*expected_json=*/"[false, false, false, true, false]", /*skip_nulls=*/true); + // With nulls and null_matching_behavior=EMIT_NULL + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[null, false, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, false, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + + // With nulls and null_matching_behavior=INCONCLUSIVE + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[null, null, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, false, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + // With duplicates in value_set CheckIsInDictionary(/*type=*/utf8(), /*index_type=*/index_ty, @@ -665,6 +711,20 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", /*expected_json=*/"[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, false, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } } From f1297544261e651a4e1df00e975e0e1e1bb782eb Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 21 Aug 2023 22:14:55 +0800 Subject: [PATCH 19/43] lint --- cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 4ffd57a0a9c08..adad7feb642d2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -130,8 +130,7 @@ void CheckIsIn(const std::shared_ptr& type, const std::string& input_j void CheckIsInChunked(const std::shared_ptr& input, const std::shared_ptr& value_set, - const std::shared_ptr& expected, - bool skip_nulls) { + const std::shared_ptr& expected, bool skip_nulls) { ASSERT_OK_AND_ASSIGN(Datum actual_datum, IsIn(input, SetLookupOptions(value_set, skip_nulls))); auto actual = actual_datum.chunked_array(); @@ -329,7 +328,7 @@ TEST_F(TestIsInKernel, TimeTimestamp) { CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, null, null, true, true]", /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); - + // Duplicates in right array CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, true, false, true, true]", /*skip_nulls=*/false); From bf7c145efabaf317e86d36147c7c675534da4392 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 22 Aug 2023 07:13:38 +0800 Subject: [PATCH 20/43] Update c_glib/arrow-glib/compute.cpp Co-authored-by: Sutou Kouhei --- c_glib/arrow-glib/compute.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 3361404d2fc1b..6ff04616cceeb 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -6463,10 +6463,7 @@ garrow_set_lookup_options_new_raw( arrow_copied_options.get()); auto value_set = garrow_datum_new_raw(&(arrow_copied_set_lookup_options->value_set)); - bool skip_nulls = false; - if (arrow_options->skip_nulls.has_value() && arrow_options->skip_nulls.value()) { - skip_nulls = true; - } + auto skip_nulls = (arrow_options->skip_nulls.has_value() && arrow_options->skip_nulls.value()); auto options = g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS, "value-set", value_set, "skip-nulls", skip_nulls, From bb55eda4ed51b1f732c86b2e60a67ce3af8e0f8c Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 22 Aug 2023 07:13:46 +0800 Subject: [PATCH 21/43] Update c_glib/arrow-glib/compute.cpp Co-authored-by: Sutou Kouhei --- c_glib/arrow-glib/compute.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 6ff04616cceeb..586426ab2f4c3 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -3402,10 +3402,7 @@ garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass) * * Since: 6.0.0 */ - bool skip_nulls = false; - if (options.skip_nulls.has_value() && options.skip_nulls.value()) { - skip_nulls = true; - } + auto skip_nulls = (options.skip_nulls.has_value() && options.skip_nulls.value()); spec = g_param_spec_boolean("skip-nulls", "Skip NULLs", "Whether NULLs are skipped or not", skip_nulls, static_cast(G_PARAM_READWRITE)); From 039eeed69b730f15d73549e3f3347b6baa894a3b Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 22 Aug 2023 07:14:15 +0800 Subject: [PATCH 22/43] Update c_glib/arrow-glib/compute.cpp Co-authored-by: Sutou Kouhei --- c_glib/arrow-glib/compute.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 586426ab2f4c3..9692f277d183f 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -3346,11 +3346,7 @@ garrow_set_lookup_options_get_property(GObject *object, g_value_set_object(value, priv->value_set); break; case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: - if (!options->skip_nulls.has_value() || !options->skip_nulls.value()) { - g_value_set_boolean(value, false); - } else { - g_value_set_boolean(value, true); - } + g_value_set_boolean(value, options->skip_nulls.has_value() && options->skip_nulls.value()); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); From bba645beb2395d4d52673b8c104f306991954471 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 23 Aug 2023 09:22:55 +0800 Subject: [PATCH 23/43] doc --- cpp/src/arrow/compute/api_scalar.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 19db83b723dd5..8714d4a245530 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -271,6 +271,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE }; explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); + // DEPRECATED(will be removed after removing of skip_nulls) explicit SetLookupOptions(Datum value_set, bool skip_nulls); SetLookupOptions(); static constexpr char const kTypeName[] = "SetLookupOptions"; @@ -278,6 +279,19 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// The set of values to look up input values into. Datum value_set; + /// How to match null values. + /// + /// Match, any null in `value_set` is successfully matched in + /// the input. + /// SKIP, any null in `value_set` is ignored and nulls in the input + /// produce null (IndexIn) or false (IsIn) values in the output. + /// EMIT_NULL, any null in `value_set` is ignored and nulls in the + /// input produce null (IndexIn and IsIn) values in the output. + /// INCONCLUSIVE, null values are regard as unknown values, which is + /// sql-compatible. nulls in the input produce null (IndexIn and IsIn) + /// values in the output. Besides, if `value_set` contains a null, + /// non-null unmatched values in the input also produce null values + /// (IndexIn and IsIn) in the output. NullMatchingBehavior null_matching_behavior; // DEPRECATED(will be removed after removing of skip_nulls) From e2aa28a62a33325eb7d54b34d376f2999fefe91f Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Tue, 29 Aug 2023 23:24:45 +0800 Subject: [PATCH 24/43] fix comments --- cpp/src/arrow/compute/api_scalar.cc | 10 ++-------- cpp/src/arrow/compute/api_scalar.h | 2 +- cpp/src/arrow/compute/kernels/scalar_set_lookup.cc | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 01f705f7f84c1..76707cf143c3b 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -564,13 +564,7 @@ constexpr char RoundToMultipleOptions::kTypeName[]; SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) : FunctionOptions(internal::kSetLookupOptionsType), value_set(std::move(value_set)), - skip_nulls(skip_nulls) { - if (skip_nulls) { - this->null_matching_behavior = SetLookupOptions::SKIP; - } else { - this->null_matching_behavior = SetLookupOptions::MATCH; - } -} + skip_nulls(std::move(skip_nulls)) {} SetLookupOptions::SetLookupOptions( Datum value_set, SetLookupOptions::NullMatchingBehavior null_matching_behavior) : FunctionOptions(internal::kSetLookupOptionsType), @@ -578,7 +572,7 @@ SetLookupOptions::SetLookupOptions( null_matching_behavior(std::move(null_matching_behavior)) {} SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {} -SetLookupOptions::NullMatchingBehavior SetLookupOptions::getNullMatchingBehavior() const { +SetLookupOptions::NullMatchingBehavior SetLookupOptions::GetNullMatchingBehavior() const { if (!this->skip_nulls.has_value()) { return this->null_matching_behavior; } else if (this->skip_nulls.value()) { diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 8714d4a245530..7391f12bd6b4e 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -295,7 +295,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { NullMatchingBehavior null_matching_behavior; // DEPRECATED(will be removed after removing of skip_nulls) - NullMatchingBehavior getNullMatchingBehavior() const; + NullMatchingBehavior GetNullMatchingBehavior() const; // DEPRECATED(use null_matching_behavior instead) /// Whether nulls in `value_set` count for lookup. diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index d5cf258a267f5..dd6fdc0212db7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -44,7 +44,7 @@ struct SetLookupState : public SetLookupStateBase { explicit SetLookupState(MemoryPool* pool) : memory_pool(pool) {} Status Init(const SetLookupOptions& options) { - this->null_matching_behavior = options.getNullMatchingBehavior(); + this->null_matching_behavior = options.GetNullMatchingBehavior(); if (options.value_set.is_array()) { const ArrayData& value_set = *options.value_set.array(); memo_index_to_value_index.reserve(value_set.length); @@ -127,7 +127,7 @@ struct SetLookupState : public SetLookupStateBase { explicit SetLookupState(MemoryPool*) {} Status Init(SetLookupOptions& options) { - null_matching_behavior = options.getNullMatchingBehavior(); + null_matching_behavior = options.GetNullMatchingBehavior(); value_set_has_null = (options.value_set.length() > 0) && this->null_matching_behavior != SetLookupOptions::SKIP; value_set_type = null(); From cfbb4ce56c834c810755fb01392329ddab9787a6 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 30 Aug 2023 09:25:10 +0800 Subject: [PATCH 25/43] fix comments 2 --- cpp/src/arrow/compute/api_scalar.h | 6 +++--- python/pyarrow/_compute.pyx | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 7391f12bd6b4e..b7ffcf85a457d 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -271,7 +271,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE }; explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); - // DEPRECATED(will be removed after removing of skip_nulls) + ARROW_DEPRECATED("will be removed after removing of skip_nulls") explicit SetLookupOptions(Datum value_set, bool skip_nulls); SetLookupOptions(); static constexpr char const kTypeName[] = "SetLookupOptions"; @@ -294,16 +294,16 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// (IndexIn and IsIn) in the output. NullMatchingBehavior null_matching_behavior; - // DEPRECATED(will be removed after removing of skip_nulls) + ARROW_DEPRECATED("will be removed after removing of skip_nulls") NullMatchingBehavior GetNullMatchingBehavior() const; - // DEPRECATED(use null_matching_behavior instead) /// Whether nulls in `value_set` count for lookup. /// /// If true, any null in `value_set` is ignored and nulls in the input /// produce null (IndexIn) or false (IsIn) values in the output. /// If false, any null in `value_set` is successfully matched in /// the input. + ARROW_DEPRECATED("use null_matching_behavior instead") std::optional skip_nulls; }; diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index bc1f5aa8ac16b..e20280ac0cfb0 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2325,7 +2325,7 @@ cdef class Expression(_Weakrefable): 1, 2, 3 - ], skip_nulls=0, null_matching_behavior=MATCH})> + ], skip_nulls=0, null_matching_behavior=})> """ def __init__(self): From 021223bc47d79d6c9d00909ed80a2595f1880138 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 30 Aug 2023 09:34:25 +0800 Subject: [PATCH 26/43] DEPRECATED --- cpp/src/arrow/compute/api_scalar.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index b7ffcf85a457d..08634a0410ead 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -271,9 +271,11 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE }; explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); - ARROW_DEPRECATED("will be removed after removing of skip_nulls") - explicit SetLookupOptions(Datum value_set, bool skip_nulls); SetLookupOptions(); + + ARROW_DEPRECATED("Will be removed after removing of skip_nulls") + explicit SetLookupOptions(Datum value_set, bool skip_nulls); + static constexpr char const kTypeName[] = "SetLookupOptions"; /// The set of values to look up input values into. @@ -294,16 +296,16 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// (IndexIn and IsIn) in the output. NullMatchingBehavior null_matching_behavior; - ARROW_DEPRECATED("will be removed after removing of skip_nulls") + ARROW_DEPRECATED("Will be removed after removing of skip_nulls") NullMatchingBehavior GetNullMatchingBehavior() const; + // DEPRECATED(use null_matching_behavior instead) /// Whether nulls in `value_set` count for lookup. /// /// If true, any null in `value_set` is ignored and nulls in the input /// produce null (IndexIn) or false (IsIn) values in the output. /// If false, any null in `value_set` is successfully matched in /// the input. - ARROW_DEPRECATED("use null_matching_behavior instead") std::optional skip_nulls; }; From 76b353a43e09ca3dd4cbcbe4ca53705b5a9f637d Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 30 Aug 2023 10:10:17 +0800 Subject: [PATCH 27/43] Deprecated update --- cpp/src/arrow/compute/api_scalar.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 08634a0410ead..07c50eaf1a351 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -273,7 +273,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); SetLookupOptions(); - ARROW_DEPRECATED("Will be removed after removing of skip_nulls") + ARROW_DEPRECATED("Deprecated in 14.0.0. Will be removed after removing of skip_nulls") explicit SetLookupOptions(Datum value_set, bool skip_nulls); static constexpr char const kTypeName[] = "SetLookupOptions"; @@ -296,7 +296,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// (IndexIn and IsIn) in the output. NullMatchingBehavior null_matching_behavior; - ARROW_DEPRECATED("Will be removed after removing of skip_nulls") + ARROW_DEPRECATED("Deprecated in 14.0.0. Will be removed after removing of skip_nulls") NullMatchingBehavior GetNullMatchingBehavior() const; // DEPRECATED(use null_matching_behavior instead) From 4338eac0ff62d332c72184e11d6700c6d90cee5f Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 30 Aug 2023 11:39:58 +0800 Subject: [PATCH 28/43] roll back --- cpp/src/arrow/compute/api_scalar.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 07c50eaf1a351..2572e8b414d9a 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -273,7 +273,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); SetLookupOptions(); - ARROW_DEPRECATED("Deprecated in 14.0.0. Will be removed after removing of skip_nulls") + // DEPRECATED(will be removed after removing of skip_nulls) explicit SetLookupOptions(Datum value_set, bool skip_nulls); static constexpr char const kTypeName[] = "SetLookupOptions"; @@ -296,7 +296,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// (IndexIn and IsIn) in the output. NullMatchingBehavior null_matching_behavior; - ARROW_DEPRECATED("Deprecated in 14.0.0. Will be removed after removing of skip_nulls") + // DEPRECATED(will be removed after removing of skip_nulls) NullMatchingBehavior GetNullMatchingBehavior() const; // DEPRECATED(use null_matching_behavior instead) From e61278b329a8ac644b4a5fb6c6f61154b363716e Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 30 Aug 2023 14:21:26 +0800 Subject: [PATCH 29/43] roll back --- cpp/src/arrow/compute/api_scalar.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 76707cf143c3b..0ee4d983227b9 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -564,7 +564,13 @@ constexpr char RoundToMultipleOptions::kTypeName[]; SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) : FunctionOptions(internal::kSetLookupOptionsType), value_set(std::move(value_set)), - skip_nulls(std::move(skip_nulls)) {} + skip_nulls(skip_nulls) { + if (skip_nulls) { + this->null_matching_behavior = SetLookupOptions::SKIP; + } else { + this->null_matching_behavior = SetLookupOptions::MATCH; + } +} SetLookupOptions::SetLookupOptions( Datum value_set, SetLookupOptions::NullMatchingBehavior null_matching_behavior) : FunctionOptions(internal::kSetLookupOptionsType), From 14fcc44d21221d6e9bff8a078307634bff42da69 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 30 Aug 2023 15:13:19 +0800 Subject: [PATCH 30/43] Update _compute.pyx --- python/pyarrow/_compute.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index e20280ac0cfb0..bc1f5aa8ac16b 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2325,7 +2325,7 @@ cdef class Expression(_Weakrefable): 1, 2, 3 - ], skip_nulls=0, null_matching_behavior=})> + ], skip_nulls=0, null_matching_behavior=MATCH})> """ def __init__(self): From f26892a989d77a65e47d3ca7f1343f5cd5cb0125 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Mon, 4 Sep 2023 14:48:32 +0800 Subject: [PATCH 31/43] Update scalar_set_lookup.cc From ef4d4b9a708d90ac9f3bd52697b3f534a189f5bd Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 08:44:20 +0800 Subject: [PATCH 32/43] Update cpp/src/arrow/compute/api_scalar.h Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/api_scalar.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 2572e8b414d9a..05bf3ca7b1978 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -283,7 +283,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// How to match null values. /// - /// Match, any null in `value_set` is successfully matched in + /// MATCH, any null in `value_set` is successfully matched in /// the input. /// SKIP, any null in `value_set` is ignored and nulls in the input /// produce null (IndexIn) or false (IsIn) values in the output. From 9628bf222701974d246bce6dc163b7c41470ec4e Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 08:44:28 +0800 Subject: [PATCH 33/43] Update cpp/src/arrow/compute/api_scalar.h Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/api_scalar.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 05bf3ca7b1978..5b33491f20964 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -289,7 +289,7 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// produce null (IndexIn) or false (IsIn) values in the output. /// EMIT_NULL, any null in `value_set` is ignored and nulls in the /// input produce null (IndexIn and IsIn) values in the output. - /// INCONCLUSIVE, null values are regard as unknown values, which is + /// INCONCLUSIVE, null values are regarded as unknown values, which is /// sql-compatible. nulls in the input produce null (IndexIn and IsIn) /// values in the output. Besides, if `value_set` contains a null, /// non-null unmatched values in the input also produce null values From f057ca69dcf1604acbe757dd9979a60aa3f8f709 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 09:15:16 +0800 Subject: [PATCH 34/43] move comment --- cpp/src/arrow/compute/api_scalar.h | 32 +++++++++++-------- .../compute/kernels/scalar_set_lookup.cc | 10 +++--- .../compute/kernels/scalar_set_lookup_test.cc | 18 +++++++++++ 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 07c50eaf1a351..ca219a14ffb1b 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -268,7 +268,24 @@ class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions { /// Options for IsIn and IndexIn functions class ARROW_EXPORT SetLookupOptions : public FunctionOptions { public: - enum NullMatchingBehavior { MATCH, SKIP, EMIT_NULL, INCONCLUSIVE }; + /// How to handle null values. + enum NullMatchingBehavior { + /// Match, any null in `value_set` is successfully matched in + /// the input. + MATCH, + /// SKIP, any null in `value_set` is ignored and nulls in the input + /// produce null (IndexIn) or false (IsIn) values in the output. + SKIP, + /// EMIT_NULL, any null in `value_set` is ignored and nulls in the + /// input produce null (IndexIn and IsIn) values in the output. + EMIT_NULL, + /// INCONCLUSIVE, null values are regard as unknown values, which is + /// sql-compatible. nulls in the input produce null (IndexIn and IsIn) + /// values in the output. Besides, if `value_set` contains a null, + /// non-null unmatched values in the input also produce null values + /// (IndexIn and IsIn) in the output. + INCONCLUSIVE + }; explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); SetLookupOptions(); @@ -281,19 +298,6 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// The set of values to look up input values into. Datum value_set; - /// How to match null values. - /// - /// Match, any null in `value_set` is successfully matched in - /// the input. - /// SKIP, any null in `value_set` is ignored and nulls in the input - /// produce null (IndexIn) or false (IsIn) values in the output. - /// EMIT_NULL, any null in `value_set` is ignored and nulls in the - /// input produce null (IndexIn and IsIn) values in the output. - /// INCONCLUSIVE, null values are regard as unknown values, which is - /// sql-compatible. nulls in the input produce null (IndexIn and IsIn) - /// values in the output. Besides, if `value_set` contains a null, - /// non-null unmatched values in the input also produce null values - /// (IndexIn and IsIn) in the output. NullMatchingBehavior null_matching_behavior; ARROW_DEPRECATED("Deprecated in 14.0.0. Will be removed after removing of skip_nulls") diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index dd6fdc0212db7..5bac2fd5831fd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -275,8 +275,9 @@ struct IndexInVisitor { IndexInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out) : ctx(ctx), data(data), out(out), out_bitmap(out->buffers[0].data) {} - Status Visit(const DataType& type) { - DCHECK_EQ(type.id(), Type::NA); + Status Visit(const DataType& type) { DCHECK(false) << "IndexIn " << type; } + + Status Visit(const NullType&) { const auto& state = checked_cast&>(*ctx->state()); if (data.length != 0) { @@ -403,8 +404,9 @@ struct IsInVisitor { out_boolean_bitmap(out->buffers[1].data), out_null_bitmap(out->buffers[0].data) {} - Status Visit(const DataType& type) { - DCHECK_EQ(type.id(), Type::NA); + Status Visit(const DataType& type) { DCHECK(false) << "IndexIn " << type; } + + Status Visit(const NullType&) { const auto& state = checked_cast&>(*ctx->state()); if (state.null_matching_behavior == SetLookupOptions::MATCH && diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index adad7feb642d2..a1755092035e7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -244,6 +244,10 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[null, true, true, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[null, true, true, false, true]", @@ -254,6 +258,10 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, null, true]", @@ -264,6 +272,10 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[true, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, null, true]", @@ -275,6 +287,12 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[true, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", "[null, true, true, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); From abfe890270c9da2cce944f9496df971c5a0613e6 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 09:33:40 +0800 Subject: [PATCH 35/43] NotImplemented --- .../compute/kernels/scalar_set_lookup.cc | 10 +++- .../compute/kernels/scalar_set_lookup_test.cc | 48 +++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 5bac2fd5831fd..2dec960e05833 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -275,7 +275,10 @@ struct IndexInVisitor { IndexInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out) : ctx(ctx), data(data), out(out), out_bitmap(out->buffers[0].data) {} - Status Visit(const DataType& type) { DCHECK(false) << "IndexIn " << type; } + Status Visit(const DataType& type) { + DCHECK(false) << "IndexIn " << type; + Status::NotImplemented("IndexIn has no implementation with value type ", type); + } Status Visit(const NullType&) { const auto& state = checked_cast&>(*ctx->state()); @@ -404,7 +407,10 @@ struct IsInVisitor { out_boolean_bitmap(out->buffers[1].data), out_null_bitmap(out->buffers[0].data) {} - Status Visit(const DataType& type) { DCHECK(false) << "IndexIn " << type; } + Status Visit(const DataType& type) { + DCHECK(false) << "IndexIn " << type; + Status::NotImplemented("IsIn has no implementation with value type ", type); + } Status Visit(const NullType&) { const auto& state = checked_cast&>(*ctx->state()); diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index a1755092035e7..2125f116f1916 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -312,12 +312,16 @@ TEST_F(TestIsInKernel, NullType) { CheckIsIn(type, "[]", "[]", "[]"); CheckIsIn(type, "[null, null]", "[null]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[null]", "[false, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[null, null]", "[null]", "[null, null]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); CheckIsIn(type, "[null, null]", "[null]", "[null, null]", /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, "[null, null]", "[]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[]", "[false, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[null, null]", "[]", "[null, null]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); CheckIsIn(type, "[null, null]", "[]", "[null, null]", @@ -326,6 +330,8 @@ TEST_F(TestIsInKernel, NullType) { // Duplicates in right array CheckIsIn(type, "[null, null, null]", "[null, null]", "[true, true, true]"); CheckIsIn(type, "[null, null]", "[null, null]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[null, null]", "[false, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[null, null]", "[null, null]", "[null, null]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); CheckIsIn(type, "[null, null]", "[null, null]", "[null, null]", @@ -340,6 +346,12 @@ TEST_F(TestIsInKernel, TimeTimestamp) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, null, false, true, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -352,6 +364,12 @@ TEST_F(TestIsInKernel, TimeTimestamp) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, null, false, true, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -380,6 +398,12 @@ TEST_F(TestIsInKernel, TimeDuration) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, null, false, true, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -392,6 +416,12 @@ TEST_F(TestIsInKernel, TimeDuration) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, null, false, true, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -417,6 +447,12 @@ TEST_F(TestIsInKernel, Boolean) { "[false, true, false, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[false]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[true, false, null, true, false]", "[false]", "[false, true, null, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -428,6 +464,12 @@ TEST_F(TestIsInKernel, Boolean) { "[false, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", "[false, true, null, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -440,6 +482,12 @@ TEST_F(TestIsInKernel, Boolean) { "[false, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", "[false, true, null, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); From 2115e22e8def4e181b62d42f39ad2aa9246291d7 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 09:35:42 +0800 Subject: [PATCH 36/43] return --- cpp/src/arrow/compute/kernels/scalar_set_lookup.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 2dec960e05833..e2d5583e36e6b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -277,7 +277,7 @@ struct IndexInVisitor { Status Visit(const DataType& type) { DCHECK(false) << "IndexIn " << type; - Status::NotImplemented("IndexIn has no implementation with value type ", type); + return Status::NotImplemented("IndexIn has no implementation with value type ", type); } Status Visit(const NullType&) { @@ -409,7 +409,7 @@ struct IsInVisitor { Status Visit(const DataType& type) { DCHECK(false) << "IndexIn " << type; - Status::NotImplemented("IsIn has no implementation with value type ", type); + return Status::NotImplemented("IsIn has no implementation with value type ", type); } Status Visit(const NullType&) { From 2352e42f9e15eca1d2a4b7d96cd36aafb986155a Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 09:51:09 +0800 Subject: [PATCH 37/43] add default test --- .../compute/kernels/scalar_set_lookup_test.cc | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 2125f116f1916..9d430a2546dda 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -275,6 +275,7 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[true, true, true, false, true]", /*null_matching_behavior=*/SetLookupOptions::MATCH); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", + "[false, true, true, false, true]", /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, false, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -507,6 +508,12 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", "[true, true, false, null, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -520,6 +527,12 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", "[true, true, false, null, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -534,6 +547,12 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, null, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -551,6 +570,12 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", "[true, true, false, null, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -564,6 +589,12 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", "[true, true, false, null, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -580,6 +611,14 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { R"(["aaa", null, "aaa", "bbb", "bbb", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", null, "aaa", "bbb", "bbb", null])", "[true, true, false, null, true]", @@ -602,6 +641,12 @@ TEST_F(TestIsInKernel, Decimal) { CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", "[true, false, true, null, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -615,6 +660,12 @@ TEST_F(TestIsInKernel, Decimal) { CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9", null])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9", null])", "[true, false, true, null, true]", /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); @@ -631,6 +682,14 @@ TEST_F(TestIsInKernel, Decimal) { R"([null, "12.3", "12.3", "78.9", "78.9", null])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"([null, "12.3", "12.3", "78.9", "78.9", null])", "[true, false, true, null, true]", @@ -661,6 +720,20 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/"[4.1, 42, -1.0]", /*expected_json=*/"[true, true, false, true]", /*skip_nulls=*/false); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "B", "C"])", + /*expected_json=*/"[true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/float32(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/"[4.1, -1.0, 42, 9.8]", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/"[4.1, 42, -1.0]", + /*expected_json=*/"[true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and skip_nulls=false CheckIsInDictionary(/*type=*/utf8(), @@ -684,6 +757,27 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[false, false, false, true, false]", /*skip_nulls=*/false); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[false, false, false, true, false]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and skip_nulls=true CheckIsInDictionary(/*type=*/utf8(), @@ -707,6 +801,27 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[false, false, false, true, false]", /*skip_nulls=*/true); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[false, false, false, true, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[false, false, false, true, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // With nulls and null_matching_behavior=EMIT_NULL CheckIsInDictionary(/*type=*/utf8(), @@ -776,6 +891,27 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", /*expected_json=*/"[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "A", "B", "A", "B", "C"])", + /*expected_json=*/"[true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIsInDictionary(/*type=*/utf8(), /*index_type=*/index_ty, /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", From e71fc27cea404aac9a66d3c5d1bc99db7e55c350 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 10:02:40 +0800 Subject: [PATCH 38/43] add test for isin --- .../arrow/compute/kernels/scalar_set_lookup_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 9d430a2546dda..95dc22b57cb75 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -939,6 +939,10 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); expected = ChunkedArrayFromJSON( boolean(), {"[true, true, true, true, false]", "[true, null, true, false]"}); CheckIsInChunked(input, value_set, expected, @@ -952,9 +956,13 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, true, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, null, false, false]"}); CheckIsInChunked(input, value_set, expected, @@ -970,9 +978,13 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, true, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, null, false, false]"}); CheckIsInChunked(input, value_set, expected, From d16163931e042d12d185520db41880acb28a055a Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 10:24:21 +0800 Subject: [PATCH 39/43] add test for indexin --- .../compute/kernels/scalar_set_lookup_test.cc | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index 95dc22b57cb75..89e10d1b54103 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -1207,6 +1207,16 @@ TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) { /*value_set=*/"[1, 3]", /*expected=*/"[null, 0, null, 1, null]", /*skip_nulls=*/true); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 3]", + /*expected=*/"[null, 0, null, 1, null]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 3]", + /*expected=*/"[null, 0, null, 1, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Same with duplicates in value_set this->CheckIndexIn(type, /*input=*/"[0, 1, 2, 3, null]", @@ -1218,6 +1228,16 @@ TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) { /*value_set=*/"[1, 1, 3, 3]", /*expected=*/"[null, 0, null, 2, null]", /*skip_nulls=*/true); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, 3, 3]", + /*expected=*/"[null, 0, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, 3, 3]", + /*expected=*/"[null, 0, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Nulls in value_set this->CheckIndexIn(type, @@ -1230,12 +1250,27 @@ TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) { /*value_set=*/"[1, 1, null, null, 3, 3]", /*expected=*/"[null, 0, null, 4, null]", /*skip_nulls=*/true); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, null, 3]", + /*expected=*/"[null, 0, null, 2, 1]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, null, null, 3, 3]", + /*expected=*/"[null, 0, null, 4, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Same with duplicates in value_set this->CheckIndexIn(type, /*input=*/"[0, 1, 2, 3, null]", /*value_set=*/"[1, 1, null, null, 3, 3]", /*expected=*/"[null, 0, null, 4, 2]", /*skip_nulls=*/false); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, null, null, 3, 3]", + /*expected=*/"[null, 0, null, 4, 2]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); } TEST_F(TestIndexInKernel, NullType) { @@ -1246,6 +1281,10 @@ TEST_F(TestIndexInKernel, NullType) { CheckIndexIn(null(), "[null, null]", "[null]", "[null, null]", /*skip_nulls=*/true); CheckIndexIn(null(), "[null, null]", "[]", "[null, null]", /*skip_nulls=*/true); + CheckIndexIn(null(), "[null, null]", "[null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexIn(null(), "[null, null]", "[]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); } TEST_F(TestIndexInKernel, TimeTimestamp) { @@ -1530,6 +1569,11 @@ TEST_F(TestIndexInKernel, FixedSizeBinary) { /*value_set=*/R"(["aaa", null, "bbb", "ccc"])", /*expected=*/R"([2, null, null, 0, 3, 0])", /*skip_nulls=*/true); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", null, "bbb", "ccc"])", + /*expected=*/R"([2, null, null, 0, 3, 0])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIndexIn(fixed_size_binary(3), /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", @@ -1540,6 +1584,11 @@ TEST_F(TestIndexInKernel, FixedSizeBinary) { /*value_set=*/R"(["aaa", "bbb", "ccc"])", /*expected=*/R"([1, null, null, 0, 2, 0])", /*skip_nulls=*/true); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", "bbb", "ccc"])", + /*expected=*/R"([1, null, null, 0, 2, 0])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Duplicates in value_set CheckIndexIn(fixed_size_binary(3), @@ -1551,6 +1600,11 @@ TEST_F(TestIndexInKernel, FixedSizeBinary) { /*value_set=*/R"(["aaa", "aaa", null, null, "bbb", "bbb", "ccc"])", /*expected=*/R"([4, null, null, 0, 6, 0])", /*skip_nulls=*/true); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", "aaa", null, null, "bbb", "bbb", "ccc"])", + /*expected=*/R"([4, null, null, 0, 6, 0])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Empty input array CheckIndexIn(fixed_size_binary(5), R"([])", R"(["bbbbb", null, "aaaaa", "ccccc"])", @@ -1577,6 +1631,11 @@ TEST_F(TestIndexInKernel, MonthDayNanoInterval) { /*value_set=*/R"([null, [4, 5, 6], [5, -1, 5]])", /*expected=*/R"([2, 0, 1, 2, null])", /*skip_nulls=*/false); + CheckIndexIn(type, + /*input=*/R"([[5, -1, 5], null, [4, 5, 6], [5, -1, 5], [1, 2, 3]])", + /*value_set=*/R"([null, [4, 5, 6], [5, -1, 5]])", + /*expected=*/R"([2, 0, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // Duplicates in value_set CheckIndexIn( @@ -1585,6 +1644,12 @@ TEST_F(TestIndexInKernel, MonthDayNanoInterval) { /*value_set=*/R"([null, null, [0, 0, 0], [0, 0, 0], [7, 8, 0], [7, 8, 0]])", /*expected=*/R"([4, 0, 2, 4, null])", /*skip_nulls=*/false); + CheckIndexIn( + type, + /*input=*/R"([[7, 8, 0], null, [0, 0, 0], [7, 8, 0], [0, 0, 1]])", + /*value_set=*/R"([null, null, [0, 0, 0], [0, 0, 0], [7, 8, 0], [7, 8, 0]])", + /*expected=*/R"([4, 0, 2, 4, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); } TEST_F(TestIndexInKernel, Decimal) { @@ -1599,6 +1664,16 @@ TEST_F(TestIndexInKernel, Decimal) { /*value_set=*/R"([null, "11", "12"])", /*expected=*/R"([2, null, 1, 2, null])", /*skip_nulls=*/true); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, 0, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, null, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIndexIn(type, /*input=*/R"(["12", null, "11", "12", "13"])", @@ -1610,6 +1685,16 @@ TEST_F(TestIndexInKernel, Decimal) { /*value_set=*/R"(["11", "12"])", /*expected=*/R"([1, null, 0, 1, null])", /*skip_nulls=*/true); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"(["11", "12"])", + /*expected=*/R"([1, null, 0, 1, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"(["11", "12"])", + /*expected=*/R"([1, null, 0, 1, null])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Duplicates in value_set CheckIndexIn(type, @@ -1627,6 +1712,21 @@ TEST_F(TestIndexInKernel, Decimal) { /*value_set=*/R"([null, "11", "12"])", /*expected=*/R"([2, 0, 1, 2, null])", /*skip_nulls=*/false); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, null, "11", "11", "12", "12"])", + /*expected=*/R"([4, 0, 2, 4, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, null, "11", "11", "12", "12"])", + /*expected=*/R"([4, null, 2, 4, null])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, 0, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); CheckIndexIn( ArrayFromJSON(decimal256(3, 1), R"(["12.0", null, "11.0", "12.0", "13.0"])"), @@ -1650,6 +1750,20 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/"[4.1, 42, -1.0]", /*expected_json=*/"[2, 1, null, 0]", /*skip_nulls=*/false); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "B", "C"])", + /*expected_json=*/"[1, 2, null, 0]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/float32(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/"[4.1, -1.0, 42, 9.8]", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/"[4.1, 42, -1.0]", + /*expected_json=*/"[2, 1, null, 0]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and skip_nulls=false CheckIndexInDictionary(/*type=*/utf8(), @@ -1673,6 +1787,27 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[null, null, null, 2, null]", /*skip_nulls=*/false); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[1, null, 3, 2, 1]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[3, null, 3, 2, 3]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, null, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and skip_nulls=true CheckIndexInDictionary(/*type=*/utf8(), @@ -1696,6 +1831,27 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[null, null, null, 2, null]", /*skip_nulls=*/true); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[1, null, null, 2, 1]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[null, null, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, null, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // With duplicates in value_set CheckIndexInDictionary(/*type=*/utf8(), @@ -1719,6 +1875,27 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])", /*expected_json=*/"[null, null, null, 4, null]", /*skip_nulls=*/true); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "A", "B", "B", "C", "C"])", + /*expected_json=*/"[2, 4, null, 0]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])", + /*expected_json=*/"[6, null, 6, 4, 6]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])", + /*expected_json=*/"[null, null, null, 4, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); } } @@ -1732,21 +1909,33 @@ TEST_F(TestIndexInKernel, ChunkedArrayInvoke) { CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Null in value_set value_set = ChunkedArrayFromJSON(utf8(), {R"(["ghi", "def"])", R"([null, "abc"])"}); expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, 2, 3, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, null, 3, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Duplicates in value_set value_set = ChunkedArrayFromJSON( utf8(), {R"(["ghi", "ghi", "def"])", R"(["def", null, null, "abc"])"}); expected = ChunkedArrayFromJSON(int32(), {"[6, 2, 0, 6, null]", "[2, 4, 6, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON(int32(), {"[6, 2, 0, 6, null]", "[2, null, 6, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); } TEST(TestSetLookup, DispatchBest) { From 8cfe97c20110d11c671f8ecae532e1ab70baee83 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 23:40:47 +0800 Subject: [PATCH 40/43] CoercedDataMember --- cpp/src/arrow/compute/api_scalar.cc | 5 +++-- cpp/src/arrow/util/reflection_internal.h | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 0ee4d983227b9..ffb33f3b4edea 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -310,6 +310,7 @@ using ::arrow::internal::checked_cast; namespace internal { namespace { using ::arrow::internal::DataMember; +using ::arrow::internal::CoercedDataMember; static auto kArithmeticOptionsType = GetFunctionOptionsType( DataMember("check_overflow", &ArithmeticOptions::check_overflow)); static auto kAssumeTimezoneOptionsType = GetFunctionOptionsType( @@ -367,8 +368,8 @@ static auto kRoundToMultipleOptionsType = GetFunctionOptionsType( DataMember("value_set", &SetLookupOptions::value_set), - DataMember("skip_nulls", &SetLookupOptions::skip_nulls), - DataMember("null_matching_behavior", &SetLookupOptions::null_matching_behavior)); + CoercedDataMember("null_matching_behavior", &SetLookupOptions::null_matching_behavior, + &SetLookupOptions::GetNullMatchingBehavior)); static auto kSliceOptionsType = GetFunctionOptionsType( DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop), DataMember("step", &SliceOptions::step)); diff --git a/cpp/src/arrow/util/reflection_internal.h b/cpp/src/arrow/util/reflection_internal.h index d7de913bafd88..5d281a265ff71 100644 --- a/cpp/src/arrow/util/reflection_internal.h +++ b/cpp/src/arrow/util/reflection_internal.h @@ -71,6 +71,30 @@ constexpr DataMemberProperty DataMember(std::string_view name, return {name, ptr}; } +template +struct CoercedDataMemberProperty { + using Class = C; + using Type = T; + + constexpr Type get(const Class& obj) const { return (obj.*get_coerced_)(); } + + void set(Class* obj, Type value) const { (*obj).*ptr_for_set_ = std::move(value); } + + constexpr std::string_view name() const { return name_; } + + std::string_view name_; + Type Class::*ptr_for_set_; + Type (Class::*get_coerced_)() const; +}; + +template +constexpr CoercedDataMemberProperty CoercedDataMember(std::string_view name, + Type Class::*ptr, + Type (Class::*get)() + const) { + return {name, ptr, get}; +} + template struct PropertyTuple { template From 2c9fa9c31e464038cab4d647a388b60111988680 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 23:48:56 +0800 Subject: [PATCH 41/43] lint --- cpp/src/arrow/compute/api_scalar.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index ffb33f3b4edea..eaec940556361 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -309,8 +309,8 @@ using ::arrow::internal::checked_cast; namespace internal { namespace { -using ::arrow::internal::DataMember; using ::arrow::internal::CoercedDataMember; +using ::arrow::internal::DataMember; static auto kArithmeticOptionsType = GetFunctionOptionsType( DataMember("check_overflow", &ArithmeticOptions::check_overflow)); static auto kAssumeTimezoneOptionsType = GetFunctionOptionsType( From 1b09661716a6c93f18f37b89d90e7283a79100ee Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Wed, 20 Sep 2023 23:55:52 +0800 Subject: [PATCH 42/43] expression test --- cpp/src/arrow/compute/expression_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index eb468da4240d3..44159e76600fb 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -263,9 +263,9 @@ TEST(Expression, ToString) { auto in_12 = call("index_in", {field_ref("beta")}, compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}); - EXPECT_EQ(in_12.ToString(), - "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=nullopt, " - "null_matching_behavior=MATCH})"); + EXPECT_EQ( + in_12.ToString(), + "index_in(beta, {value_set=int32:[\n 1,\n 2\n], null_matching_behavior=MATCH})"); EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)"); EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)"); From 38c1a04817ffa20e3baef1ed6312d7cf6501fc68 Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Thu, 21 Sep 2023 00:30:19 +0800 Subject: [PATCH 43/43] python expression --- python/pyarrow/_compute.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index bc1f5aa8ac16b..f552f1c9f4cb1 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2325,7 +2325,7 @@ cdef class Expression(_Weakrefable): 1, 2, 3 - ], skip_nulls=0, null_matching_behavior=MATCH})> + ], null_matching_behavior=MATCH})> """ def __init__(self):