Skip to content

Commit

Permalink
ARROW-7781: [C++] Improve message when referencing a missing field
Browse files Browse the repository at this point in the history
- The failure now indicate which field name is causing the issue.
- Add the `CanReferenceFieldsByNames` in Schema and relevant tests.

Closes #6421 from fsaintjacques/ARROW-7781-missing-filter-field-message and squashes the following commits:

461d663 <François Saint-Jacques> ARROW-7781:  Improve message when referencing a missing field

Authored-by: François Saint-Jacques <fsaintjacques@gmail.com>
Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
  • Loading branch information
fsaintjacques authored and kszucs committed Feb 14, 2020
1 parent d4e3898 commit 9392531
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 15 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/dataset/filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,7 @@ struct InsertImplicitCastsImpl {

Result<std::shared_ptr<Expression>> InsertImplicitCasts(const Expression& expr,
const Schema& schema) {
RETURN_NOT_OK(schema.CanReferenceFieldsByNames(FieldsInExpression(expr)));
return VisitExpression(expr, InsertImplicitCastsImpl{schema});
}

Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/dataset/filter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,9 @@ TEST_F(ExpressionsTest, ImplicitCast) {
auto set_int32 = ArrayFromJSON(int32(), R"([1, 2, 3])");
ASSERT_EQ(E{filter}, E{"a"_.In(set_int32)});

ASSERT_RAISES(Invalid, InsertImplicitCasts("nope"_ == 0.0, *schema_));
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
testing::HasSubstr("Field named 'nope' not found"),
InsertImplicitCasts("nope"_ == 0.0, *schema_));
}

TEST_F(FilterTest, ImplicitCast) {
Expand Down
16 changes: 2 additions & 14 deletions cpp/src/arrow/dataset/scanner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,27 +112,15 @@ ScannerBuilder::ScannerBuilder(std::shared_ptr<Dataset> dataset,
options_(ScanOptions::Make(dataset_->schema())),
context_(std::move(context)) {}

Status EnsureColumnsInSchema(const std::shared_ptr<Schema>& schema,
const std::vector<std::string>& columns) {
for (const auto& column : columns) {
if (schema->GetFieldByName(column) == nullptr) {
return Status::Invalid("Requested column ", column,
" not found in dataset's schema.");
}
}

return Status::OK();
}

Status ScannerBuilder::Project(const std::vector<std::string>& columns) {
RETURN_NOT_OK(EnsureColumnsInSchema(schema(), columns));
RETURN_NOT_OK(schema()->CanReferenceFieldsByNames(columns));
has_projection_ = true;
project_columns_ = columns;
return Status::OK();
}

Status ScannerBuilder::Filter(std::shared_ptr<Expression> filter) {
RETURN_NOT_OK(EnsureColumnsInSchema(schema(), FieldsInExpression(*filter)));
RETURN_NOT_OK(schema()->CanReferenceFieldsByNames(FieldsInExpression(*filter)));
RETURN_NOT_OK(filter->Validate(*schema()).status());
options_->filter = std::move(filter);
return Status::OK();
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,17 @@ std::vector<int> Schema::GetAllFieldIndices(const std::string& name) const {
return result;
}

Status Schema::CanReferenceFieldsByNames(const std::vector<std::string>& names) const {
for (const auto& name : names) {
if (GetFieldByName(name) == nullptr) {
return Status::Invalid("Field named '", name,
"' not found or not unique in the schema.");
}
}

return Status::OK();
}

std::vector<std::shared_ptr<Field>> Schema::GetAllFieldsByName(
const std::string& name) const {
std::vector<std::shared_ptr<Field>> result;
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -1442,6 +1442,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
/// Return the indices of all fields having this name
std::vector<int> GetAllFieldIndices(const std::string& name) const;

/// Indicate if fields named `names` can be found unambiguously in the schema.
Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;

/// \brief The custom key-value metadata, if any
///
/// \return metadata may be null
Expand Down
21 changes: 21 additions & 0 deletions cpp/src/arrow/type_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,27 @@ TEST_F(TestSchema, GetFieldDuplicates) {
ASSERT_EQ(results.size(), 0);
}

TEST_F(TestSchema, CanReferenceFieldsByNames) {
auto f0 = field("f0", int32());
auto f1 = field("f1", uint8(), false);
auto f2 = field("f2", utf8());
auto f3 = field("f1", list(int16()));

auto schema = ::arrow::schema({f0, f1, f2, f3});

ASSERT_OK(schema->CanReferenceFieldsByNames({"f0", "f2"}));
ASSERT_OK(schema->CanReferenceFieldsByNames({"f2", "f0"}));

// Not found
ASSERT_RAISES(Invalid, schema->CanReferenceFieldsByNames({"nope"}));
ASSERT_RAISES(Invalid, schema->CanReferenceFieldsByNames({"f0", "nope"}));
// Duplicates
ASSERT_RAISES(Invalid, schema->CanReferenceFieldsByNames({"f1"}));
ASSERT_RAISES(Invalid, schema->CanReferenceFieldsByNames({"f0", "f1"}));
// Both
ASSERT_RAISES(Invalid, schema->CanReferenceFieldsByNames({"f0", "f1", "nope"}));
}

TEST_F(TestSchema, TestMetadataConstruction) {
auto metadata0 = key_value_metadata({{"foo", "bar"}, {"bizz", "buzz"}});
auto metadata1 = key_value_metadata({{"foo", "baz"}});
Expand Down

0 comments on commit 9392531

Please sign in to comment.