Skip to content

Commit

Permalink
ARROW-8216: [C++][Compute] Filter out nulls by default
Browse files Browse the repository at this point in the history
In C++ and R, Filter's default is now to drop from output when a selection slot is null. In Python, `Array.filter` now has a keyword argument `drop_nulls=True`. In R, since the expected behavior is the opposite, the argument is `keep_na = TRUE`. Old behavior is preserved in c_glib; I'm not familiar enough with that binding to add an optional parameter in an idiomatic fashion (@mrkn ?)

Closes #6732 from bkietz/8216-Filtering-returns-all-mis

Lead-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
Co-authored-by: Kenta Murata <mrkn@mrkn.jp>
Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
  • Loading branch information
3 people committed Apr 2, 2020
1 parent cf18a62 commit d14e778
Show file tree
Hide file tree
Showing 33 changed files with 1,073 additions and 540 deletions.
318 changes: 286 additions & 32 deletions c_glib/arrow-glib/compute.cpp

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions c_glib/arrow-glib/compute.h
Expand Up @@ -68,6 +68,37 @@ GArrowCountOptions *
garrow_count_options_new(void);


/**
* GArrowFilterNullSelectionBehavior:
* @GARROW_FILTER_NULL_SELECTION_DROP:
* Filtered value will be removed in the output.
* @GARROW_FILTER_NULL_SELECTION_EMIT_NULL:
* Filtered value will be null in the output.
*
* They are corresponding to
* `arrow::compute::FilterOptions::NullSelectionBehavior` values.
*/
typedef enum {
GARROW_FILTER_NULL_SELECTION_DROP,
GARROW_FILTER_NULL_SELECTION_EMIT_NULL,
} GArrowFilterNullSelectionBehavior;

#define GARROW_TYPE_FILTER_OPTIONS (garrow_filter_options_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowFilterOptions,
garrow_filter_options,
GARROW,
FILTER_OPTIONS,
GObject)
struct _GArrowFilterOptionsClass
{
GObjectClass parent_class;
};

GARROW_AVAILABLE_IN_1_0
GArrowFilterOptions *
garrow_filter_options_new(void);


#define GARROW_TYPE_TAKE_OPTIONS (garrow_take_options_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowTakeOptions,
garrow_take_options,
Expand Down Expand Up @@ -291,6 +322,7 @@ GARROW_AVAILABLE_IN_0_15
GArrowArray *
garrow_array_filter(GArrowArray *array,
GArrowBooleanArray *filter,
GArrowFilterOptions *options,
GError **error);
GARROW_AVAILABLE_IN_0_15
GArrowBooleanArray *
Expand All @@ -310,26 +342,31 @@ GARROW_AVAILABLE_IN_1_0
GArrowTable *
garrow_table_filter(GArrowTable *table,
GArrowBooleanArray *filter,
GArrowFilterOptions *options,
GError **error);
GARROW_AVAILABLE_IN_1_0
GArrowTable *
garrow_table_filter_chunked_array(GArrowTable *table,
GArrowChunkedArray *filter,
GArrowFilterOptions *options,
GError **error);
GARROW_AVAILABLE_IN_1_0
GArrowChunkedArray *
garrow_chunked_array_filter(GArrowChunkedArray *chunked_array,
GArrowBooleanArray *filter,
GArrowFilterOptions *options,
GError **error);
GARROW_AVAILABLE_IN_1_0
GArrowChunkedArray *
garrow_chunked_array_filter_chunked_array(GArrowChunkedArray *chunked_array,
GArrowChunkedArray *filter,
GArrowFilterOptions *options,
GError **error);
GARROW_AVAILABLE_IN_1_0
GArrowRecordBatch *
garrow_record_batch_filter(GArrowRecordBatch *record_batch,
GArrowBooleanArray *filter,
GArrowFilterOptions *options,
GError **error);

G_END_DECLS
3 changes: 3 additions & 0 deletions c_glib/arrow-glib/compute.hpp
Expand Up @@ -31,6 +31,9 @@ garrow_count_options_new_raw(arrow::compute::CountOptions *arrow_count_options);
arrow::compute::CountOptions *
garrow_count_options_get_raw(GArrowCountOptions *count_options);

arrow::compute::FilterOptions *
garrow_filter_options_get_raw(GArrowFilterOptions *filter_options);

arrow::compute::TakeOptions *
garrow_take_options_get_raw(GArrowTakeOptions *take_options);

Expand Down
7 changes: 7 additions & 0 deletions c_glib/test/test-compare.rb
Expand Up @@ -22,6 +22,13 @@ def setup
@options = Arrow::CompareOptions.new
end

sub_test_case("CompareOptions") do
def test_default_operator
assert_equal(Arrow::CompareOperator::EQUAL,
@options.operator)
end
end

sub_test_case("operator") do
def test_equal
@options.operator = :equal
Expand Down
7 changes: 7 additions & 0 deletions c_glib/test/test-count.rb
Expand Up @@ -19,6 +19,13 @@ class TestCount < Test::Unit::TestCase
include Helper::Buildable
include Helper::Omittable

sub_test_case("CountOptions") do
def test_default_mode
assert_equal(Arrow::CountMode::ALL,
Arrow::CountOptions.new.mode)
end
end

sub_test_case("mode") do
def test_default
assert_equal(2, build_int32_array([1, nil, 3]).count)
Expand Down
102 changes: 95 additions & 7 deletions c_glib/test/test-filter.rb
Expand Up @@ -18,11 +18,28 @@
class TestFilter < Test::Unit::TestCase
include Helper::Buildable

sub_test_case("FilterOptions") do
def test_default_null_selection_behavior
assert_equal(Arrow::FilterNullSelectionBehavior::DROP,
Arrow::FilterOptions.new.null_selection_behavior)
end
end

sub_test_case("Array") do
def setup
@filter = build_boolean_array([false, true, true, nil])
end

def test_filter
filter = build_boolean_array([false, true, true, nil])
assert_equal(build_int16_array([1, 0]),
build_int16_array([0, 1, 0, 2]).filter(@filter))
end

def test_filter_emit_null
options = Arrow::FilterOptions.new
options.null_selection_behavior = :emit_null
assert_equal(build_int16_array([1, 0, nil]),
build_int16_array([0, 1, 0, 2]).filter(filter))
build_int16_array([0, 1, 0, 2]).filter(@filter, options))
end

def test_invalid_array_length
Expand All @@ -48,17 +65,45 @@ def setup
end

def test_filter
filter = build_boolean_array([false, true, nil])
arrays = [
build_boolean_array([false]),
build_boolean_array([true]),
]
filtered_table = Arrow::Table.new(@schema, arrays)
assert_equal(filtered_table,
@table.filter(filter))
end

def test_filter_emit_null
filter = build_boolean_array([false, true, nil])
arrays = [
build_boolean_array([false, nil]),
build_boolean_array([true, nil]),
]
filtered_table = Arrow::Table.new(@schema, arrays)
options = Arrow::FilterOptions.new
options.null_selection_behavior = :emit_null
assert_equal(filtered_table,
@table.filter(filter))
@table.filter(filter, options))
end

def test_filter_chunked_array
chunks = [
build_boolean_array([false]),
build_boolean_array([true, nil]),
]
filter = Arrow::ChunkedArray.new(chunks)
arrays = [
build_boolean_array([false]),
build_boolean_array([true]),
]
filtered_table = Arrow::Table.new(@schema, arrays)
assert_equal(filtered_table,
@table.filter_chunked_array(filter))
end

def test_filter_chunked_array_emit_null
chunks = [
build_boolean_array([false]),
build_boolean_array([true, nil]),
Expand All @@ -69,8 +114,10 @@ def test_filter_chunked_array
build_boolean_array([true, nil]),
]
filtered_table = Arrow::Table.new(@schema, arrays)
options = Arrow::FilterOptions.new
options.null_selection_behavior = :emit_null
assert_equal(filtered_table,
@table.filter_chunked_array(filter))
@table.filter_chunked_array(filter, options))
end

def test_invalid_array_length
Expand All @@ -94,13 +141,25 @@ def test_filter
filter = build_boolean_array([false, true, nil])
chunks = [
build_boolean_array([false]),
build_boolean_array([nil]),
]
filtered_chunked_array = Arrow::ChunkedArray.new(chunks)
assert_equal(filtered_chunked_array,
@chunked_array.filter(filter))
end

def test_filter_emit_null
filter = build_boolean_array([false, true, nil])
chunks = [
build_boolean_array([false]),
build_boolean_array([nil]),
]
filtered_chunked_array = Arrow::ChunkedArray.new(chunks)
options = Arrow::FilterOptions.new
options.null_selection_behavior = :emit_null
assert_equal(filtered_chunked_array,
@chunked_array.filter(filter, options))
end

def test_filter_chunked_array
chunks = [
build_boolean_array([false]),
Expand All @@ -109,13 +168,29 @@ def test_filter_chunked_array
filter = Arrow::ChunkedArray.new(chunks)
filtered_chunks = [
build_boolean_array([false]),
build_boolean_array([nil]),
]
filtered_chunked_array = Arrow::ChunkedArray.new(filtered_chunks)
assert_equal(filtered_chunked_array,
@chunked_array.filter_chunked_array(filter))
end

def test_filter_chunked_array_emit_null
chunks = [
build_boolean_array([false]),
build_boolean_array([true, nil]),
]
filter = Arrow::ChunkedArray.new(chunks)
filtered_chunks = [
build_boolean_array([false]),
build_boolean_array([nil]),
]
filtered_chunked_array = Arrow::ChunkedArray.new(filtered_chunks)
options = Arrow::FilterOptions.new
options.null_selection_behavior = :emit_null
assert_equal(filtered_chunked_array,
@chunked_array.filter_chunked_array(filter, options))
end

def test_invalid_array_length
filter = build_boolean_array([false, true, true, false])
assert_raise(Arrow::Error::Invalid) do
Expand All @@ -139,14 +214,27 @@ def setup
end

def test_filter
filter = build_boolean_array([false, true, nil])
columns = [
build_boolean_array([false]),
build_boolean_array([true]),
]
filtered_record_batch = Arrow::RecordBatch.new(@schema, 1, columns)
assert_equal(filtered_record_batch,
@record_batch.filter(filter))
end

def test_filter_emit_null
filter = build_boolean_array([false, true, nil])
columns = [
build_boolean_array([false, nil]),
build_boolean_array([true, nil]),
]
filtered_record_batch = Arrow::RecordBatch.new(@schema, 2, columns)
options = Arrow::FilterOptions.new
options.null_selection_behavior = :emit_null
assert_equal(filtered_record_batch,
@record_batch.filter(filter))
@record_batch.filter(filter, options))
end

def test_invalid_array_length
Expand Down
52 changes: 26 additions & 26 deletions cpp/src/arrow/array/diff_test.cc
Expand Up @@ -117,6 +117,22 @@ class DiffTest : public ::testing::Test {
ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(int64(), run_lengths_json), *run_lengths_);
}

void BaseAndTargetFromRandomFilter(std::shared_ptr<Array> values,
double filter_probability) {
compute::Datum out_datum, base_filter, target_filter;
do {
base_filter = this->rng_.Boolean(values->length(), filter_probability, 0.0);
target_filter = this->rng_.Boolean(values->length(), filter_probability, 0.0);
} while (base_filter.Equals(target_filter));

ASSERT_OK(compute::Filter(&ctx_, values, base_filter, {}, &out_datum));
base_ = out_datum.make_array();

ASSERT_OK(compute::Filter(&ctx_, values, target_filter, {}, &out_datum));
target_ = out_datum.make_array();
}

compute::FunctionContext ctx_;
random::RandomArrayGenerator rng_;
std::shared_ptr<StructArray> edits_;
std::shared_ptr<Array> base_, target_;
Expand Down Expand Up @@ -210,15 +226,10 @@ TYPED_TEST(DiffTestWithNumeric, Basics) {
}

TEST_F(DiffTest, CompareRandomInt64) {
compute::FunctionContext ctx;
for (auto null_probability : {0.0, 0.25}) {
auto values = this->rng_.Int64(1 << 10, 0, 127, null_probability);
for (const double filter_probability : {0.99, 0.75, 0.5}) {
auto filter_1 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
auto filter_2 = this->rng_.Boolean(values->length(), filter_probability, 0.0);

ASSERT_OK(compute::Filter(&ctx, *values, *filter_1, &this->base_));
ASSERT_OK(compute::Filter(&ctx, *values, *filter_2, &this->target_));
this->BaseAndTargetFromRandomFilter(values, filter_probability);

std::stringstream formatted;
this->DoDiffAndFormat(&formatted);
Expand All @@ -231,15 +242,10 @@ TEST_F(DiffTest, CompareRandomInt64) {
}

TEST_F(DiffTest, CompareRandomStrings) {
compute::FunctionContext ctx;
for (auto null_probability : {0.0, 0.25}) {
auto values = this->rng_.StringWithRepeats(1 << 10, 1 << 8, 0, 32, null_probability);
for (const double filter_probability : {0.99, 0.75, 0.5}) {
auto filter_1 = this->rng_.Boolean(values->length(), filter_probability, 0.0);
auto filter_2 = this->rng_.Boolean(values->length(), filter_probability, 0.0);

ASSERT_OK(compute::Filter(&ctx, *values, *filter_1, &this->base_));
ASSERT_OK(compute::Filter(&ctx, *values, *filter_2, &this->target_));
this->BaseAndTargetFromRandomFilter(values, filter_probability);

std::stringstream formatted;
this->DoDiffAndFormat(&formatted);
Expand Down Expand Up @@ -614,21 +620,15 @@ TEST_F(DiffTest, CompareRandomStruct) {
auto int32_values = this->rng_.Int32(length, 0, 127, null_probability);
auto utf8_values = this->rng_.String(length, 0, 16, null_probability);
for (const double filter_probability : {0.9999, 0.75}) {
std::shared_ptr<Array> int32_base, int32_target, utf8_base, utf8_target;
ASSERT_OK(compute::Filter(&ctx, *int32_values,
*this->rng_.Boolean(length, filter_probability, 0.0),
&int32_base));
ASSERT_OK(compute::Filter(&ctx, *utf8_values,
*this->rng_.Boolean(length, filter_probability, 0.0),
&utf8_base));
MakeSameLength(&int32_base, &utf8_base);
this->BaseAndTargetFromRandomFilter(int32_values, filter_probability);
auto int32_base = this->base_;
auto int32_target = this->base_;

ASSERT_OK(compute::Filter(&ctx, *int32_values,
*this->rng_.Boolean(length, filter_probability, 0.0),
&int32_target));
ASSERT_OK(compute::Filter(&ctx, *utf8_values,
*this->rng_.Boolean(length, filter_probability, 0.0),
&utf8_target));
this->BaseAndTargetFromRandomFilter(utf8_values, filter_probability);
auto utf8_base = this->base_;
auto utf8_target = this->base_;

MakeSameLength(&int32_base, &utf8_base);
MakeSameLength(&int32_target, &utf8_target);

auto type = struct_({field("i", int32()), field("s", utf8())});
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/arrow/compute/kernel.h
Expand Up @@ -219,6 +219,19 @@ struct ARROW_EXPORT Datum {
return kUnknownLength;
}

/// \brief The array chunks of the variant, if any
///
/// \return empty if not arraylike
ArrayVector chunks() const {
if (!this->is_arraylike()) {
return {};
}
if (this->is_array()) {
return {this->make_array()};
}
return this->chunked_array()->chunks();
}

bool Equals(const Datum& other) const {
if (this->kind() != other.kind()) return false;

Expand Down

0 comments on commit d14e778

Please sign in to comment.