Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/common/src/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ mod tests {
&[],
)
.expect_err("should've failed to find field");
let expected = "Schema error: No field named z. \
let expected = "Schema error: No field named z.\n\
Valid fields are t1.a, t1.b, t2.c, t2.d, t3.a, t3.b, t3.c, t3.d, t3.e.";
assert_eq!(err.strip_backtrace(), expected);

Expand Down
44 changes: 38 additions & 6 deletions datafusion/common/src/dfschema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1427,11 +1427,8 @@ mod tests {
let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
// lookup with unqualified name "t1.c0"
let err = schema.index_of_column(&col).unwrap_err();
let expected = "Schema error: No field named \"t1.c0\". \
Column names are case sensitive. \
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The case-sensitivity hint now only shows up if the input expression has a case-insensitive match.

You can use double quotes to refer to the \"\"t1.c0\"\" column \
or set the datafusion.sql_parser.enable_ident_normalization configuration. \
Did you mean 't1.c0'?.";
let expected = "Schema error: No field named \"t1.c0\". Did you mean 't1.c0'?\n\
Valid fields are t1.c0, t1.c1.";
assert_eq!(err.strip_backtrace(), expected);
Ok(())
}
Expand All @@ -1449,12 +1446,47 @@ mod tests {

// lookup with unqualified name "t1.c0"
let err = schema.index_of_column(&col).unwrap_err();
let expected = "Schema error: No field named \"t1.c0\". \
let expected = "Schema error: No field named \"t1.c0\".\n\
Valid fields are t1.\"CapitalColumn\", t1.\"field.with.period\".";
assert_eq!(err.strip_backtrace(), expected);
Ok(())
}

/// Looking up a near-miss name (`abc`) on an unqualified schema must suggest
/// the closest existing field (`abcd`) and then list every valid field.
#[test]
fn field_not_found_suggests_closest_field_name() -> Result<()> {
    let fields = vec![
        Field::new("abzz", DataType::Boolean, true),
        Field::new("abcd", DataType::Boolean, true),
    ];
    let df_schema = DFSchema::try_from(Schema::new(fields))?;

    let message = df_schema
        .field_with_unqualified_name("abc")
        .unwrap_err()
        .strip_backtrace();
    assert_eq!(
        message,
        "Schema error: No field named abc. Did you mean 'abcd'?\n\
         Valid fields are abzz, abcd."
    );
    Ok(())
}

/// Looking up `url` on a qualified schema that only contains `"URL"` must
/// suggest the quoted, qualified column, explain that column names are case
/// sensitive, and then list every valid field.
#[test]
fn field_not_found_suggests_case_sensitive_qualified_field() -> Result<()> {
    let arrow_schema = Schema::new(vec![
        Field::new("WatchID", DataType::Boolean, true),
        Field::new("URL", DataType::Boolean, true),
        Field::new("URLHash", DataType::Boolean, true),
    ]);
    let df_schema = DFSchema::try_from_qualified_schema("hits", &arrow_schema)?;

    let message = df_schema
        .field_with_unqualified_name("url")
        .unwrap_err()
        .strip_backtrace();
    assert_eq!(
        message,
        "Schema error: No field named url. Did you mean 'hits.\"URL\"'?\n\
         Column names are case sensitive. \
         You can use double quotes to refer to the hits.\"URL\" column \
         or set the datafusion.sql_parser.enable_ident_normalization configuration.\n\
         Valid fields are hits.\"WatchID\", hits.\"URL\", hits.\"URLHash\"."
    );
    Ok(())
}

#[test]
fn from_unqualified_schema() -> Result<()> {
let schema = DFSchema::try_from(test_schema_1())?;
Expand Down
112 changes: 86 additions & 26 deletions datafusion/common/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ use std::io;
use std::result;
use std::sync::Arc;

use crate::utils::datafusion_strsim::normalized_levenshtein;
use crate::utils::datafusion_strsim::{levenshtein, normalized_levenshtein};
use crate::utils::quote_identifier;
use crate::{Column, DFSchema, Diagnostic, TableReference};
use arrow::error::ArrowError;
Expand Down Expand Up @@ -198,51 +198,111 @@ pub enum SchemaError {
},
}

/// Returns the first entry of `valid_fields` that matches `field` when case is
/// ignored but is not an exact match.
///
/// A candidate qualifies when either its bare name or its flat name equals the
/// corresponding name of `field` after lowercasing, while the un-lowercased
/// strings differ. Returns `None` when no candidate qualifies.
fn case_insensitive_field_match<'a>(
    field: &Column,
    valid_fields: &'a [Column],
) -> Option<&'a Column> {
    let wanted_name = field.name();
    let wanted_flat = field.flat_name();
    let wanted_name_folded = wanted_name.to_lowercase();
    let wanted_flat_folded = wanted_flat.to_lowercase();

    for candidate in valid_fields {
        // Bare column name: same letters, different casing.
        let candidate_name = candidate.name();
        if candidate_name != wanted_name
            && candidate_name.to_lowercase() == wanted_name_folded
        {
            return Some(candidate);
        }

        // Flat name: same letters, different casing.
        let candidate_flat = candidate.flat_name();
        if candidate_flat != wanted_flat
            && candidate_flat.to_lowercase() == wanted_flat_folded
        {
            return Some(candidate);
        }
    }

    None
}

/// Picks the entry of `valid_fields` whose name is most similar to `field`.
///
/// Both the bare name and the flat name of `field` are compared, lowercased,
/// against the lowercased bare and flat names of every candidate using
/// `levenshtein`. A pairing is only considered when the longer of the two
/// strings is non-empty and the distance is at most half of that length.
///
/// Among the considered pairings the winner has the smallest distance; ties
/// are broken in favour of the longer compared string, then the earlier
/// position in `valid_fields`. Returns `None` when no pairing qualifies.
fn closest_valid_field<'a>(
    field: &Column,
    valid_fields: &'a [Column],
) -> Option<&'a Column> {
    let wanted = [
        field.name().to_lowercase(),
        field.flat_name().to_lowercase(),
    ];

    valid_fields
        .iter()
        .enumerate()
        .flat_map(|(position, candidate)| {
            let spellings = [
                candidate.name().to_lowercase(),
                candidate.flat_name().to_lowercase(),
            ];

            // Score every (query, spelling) pairing for this candidate and keep
            // only the ones that are close enough to be worth suggesting.
            let mut scored = Vec::with_capacity(wanted.len() * spellings.len());
            for query in &wanted {
                for spelling in &spellings {
                    let edits = levenshtein(query, spelling);
                    let longest = query.chars().count().max(spelling.chars().count());
                    if longest != 0 && edits * 2 <= longest {
                        scored.push((edits, longest, position, candidate));
                    }
                }
            }
            scored
        })
        // `min_by` keeps the first of several equal minima, which matches
        // "only replace the current best on a strict improvement".
        .min_by(|left, right| {
            left.0
                .cmp(&right.0)
                // Longer compared strings win, hence the swapped operands.
                .then_with(|| right.1.cmp(&left.1))
                .then_with(|| left.2.cmp(&right.2))
        })
        .map(|(_, _, _, candidate)| candidate)
}

impl Display for SchemaError {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::FieldNotFound {
field,
valid_fields,
} => {
let closest_field = closest_valid_field(field, valid_fields);
let case_sensitive_match =
case_insensitive_field_match(field, valid_fields);

write!(f, "No field named {}", field.quoted_flat_name())?;
let lower_valid_fields = valid_fields
.iter()
.map(|column| column.flat_name().to_lowercase())
.collect::<Vec<String>>();

let valid_fields_names = valid_fields
.iter()
.map(|column| column.flat_name())
.collect::<Vec<String>>();
if lower_valid_fields.contains(&field.flat_name().to_lowercase()) {
if let Some(matched) = closest_field {
write!(f, ". Did you mean '{}'?", matched.quoted_flat_name())?;
} else {
write!(f, ".")?;
}

if let Some(case_sensitive_match) = case_sensitive_match {
write!(
f,
". Column names are case sensitive. You can use double quotes to refer to the \"{}\" column \
or set the datafusion.sql_parser.enable_ident_normalization configuration",
field.quoted_flat_name()
"\nColumn names are case sensitive. You can use double quotes to refer to the {} column \
or set the datafusion.sql_parser.enable_ident_normalization configuration.",
case_sensitive_match.quoted_flat_name()
)?;
}
let field_name = field.name();
if let Some(matched) = valid_fields_names
.iter()
.filter(|str| normalized_levenshtein(str, field_name) >= 0.5)
.collect::<Vec<&String>>()
.first()
{
write!(f, ". Did you mean '{matched}'?")?;
} else if !valid_fields.is_empty() {

if !valid_fields.is_empty() {
write!(
f,
". Valid fields are {}",
"\nValid fields are {}.",
valid_fields
.iter()
.map(|field| field.quoted_flat_name())
.collect::<Vec<String>>()
.join(", ")
)?;
)
} else {
Ok(())
}
write!(f, ".")
}
Self::DuplicateQualifiedField { qualifier, name } => {
write!(
Expand Down
13 changes: 11 additions & 2 deletions datafusion/core/tests/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1141,7 +1141,13 @@ async fn test_aggregate_name_collision() -> Result<()> {
// The select expr has the same display_name as the group_expr,
// but since they are different expressions, it should fail.
.expect_err("Expected error");
assert_snapshot!(df.strip_backtrace(), @r#"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."#);
assert_snapshot!(
df.strip_backtrace(),
@r#"
Schema error: No field named aggregate_test_100.c2.
Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3".
"#
);

Ok(())
}
Expand Down Expand Up @@ -6305,7 +6311,10 @@ async fn test_alias_nested() -> Result<()> {
let select2 = df.select(vec![col("alias1.a")]);
assert_snapshot!(
select2.unwrap_err().strip_backtrace(),
@"Schema error: No field named alias1.a. Valid fields are alias2.a, alias2.b, alias2.one."
@r#"
Schema error: No field named alias1.a. Did you mean 'alias2.a'?
Valid fields are alias2.a, alias2.b, alias2.one.
"#
);
Ok(())
}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/expr/src/expr_rewriter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ mod test {
normalize_col_with_schemas_and_ambiguity_check(expr, &[&schemas], &[])
.unwrap_err()
.strip_backtrace();
let expected = "Schema error: No field named b. \
let expected = "Schema error: No field named b.\n\
Valid fields are \"tableA\".a.";
assert_eq!(error, expected);
}
Expand Down
7 changes: 5 additions & 2 deletions datafusion/sql/tests/sql_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ fn plan_insert_no_target_columns() {
)]
#[case::non_existing_column(
"INSERT INTO test_decimal (nonexistent, price) VALUES (1, 2), (4, 5)",
"Schema error: No field named nonexistent. \
"Schema error: No field named nonexistent.\n\
Valid fields are id, price."
)]
#[case::target_column_count_mismatch(
Expand Down Expand Up @@ -1681,7 +1681,10 @@ fn select_simple_aggregate_with_groupby_and_column_in_group_by_does_not_exist()

assert_snapshot!(
err.strip_backtrace(),
@r#"Schema error: No field named doesnotexist. Valid fields are "sum(person.age)", person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person."😀"."#
@r#"
Schema error: No field named doesnotexist.
Valid fields are "sum(person.age)", person.id, person.first_name, person.last_name, person.age, person.state, person.salary, person.birth_date, person."😀".
"#
);
}

Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/delete.slt
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ physical_plan


# Deleting by columns that do not exist returns an error
query error DataFusion error: Schema error: No field named e. Valid fields are t1.a, t1.b, t1.c, t1.d.
query error DataFusion error: Schema error: No field named e\.\nValid fields are t1.a, t1.b, t1.c, t1.d.
explain delete from t1 where e = 1;


Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/errors.slt
Original file line number Diff line number Diff line change
Expand Up @@ -180,13 +180,13 @@ SELECT DISTINCT - 84 FROM tab0 AS cor0 WHERE NOT + 96 / + col1 <= NULL GROUP BY
statement ok
create table a(timestamp int, birthday int, ts int, tokens int, amp int, staamp int);

query error DataFusion error: Schema error: No field named timetamp\. Did you mean 'a\.timestamp'\?\.
query error DataFusion error: Schema error: No field named timetamp\. Did you mean 'a\.timestamp'\?\nValid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\.
select timetamp from a;

query error DataFusion error: Schema error: No field named dadsada\. Valid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\.
query error DataFusion error: Schema error: No field named dadsada\.\nValid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\.
select dadsada from a;

query error DataFusion error: Schema error: No field named ammp\. Did you mean 'a\.amp'\?\.
query error DataFusion error: Schema error: No field named ammp\. Did you mean 'a\.amp'\?\nValid fields are a\.timestamp, a\.birthday, a\.ts, a\.tokens, a\.amp, a\.staamp\.
select ammp from a;

statement ok
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/ident_normalization.slt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ A Int64 NO

# Expect error as 'a' is not a column -- "A" is and the identifiers
# are not normalized
query error DataFusion error: Schema error: No field named a\. Valid fields are x\."A"\.
query error DataFusion error: Schema error: No field named a\. Did you mean 'x\."A"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the x\."A" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are x\."A"\.
select a from x;

# should work (note the uppercase 'A')
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/identifiers.slt
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,16 @@ drop table case_insensitive_test
statement ok
CREATE TABLE test("Column1" string) AS VALUES ('content1');

statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\.
SELECT COLumn1 from test

statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\.
SELECT Column1 from test

statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\.
SELECT column1 from test

statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\.Column1'\?\.
statement error DataFusion error: Schema error: No field named column1\. Did you mean 'test\."Column1"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the test\."Column1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are test\."Column1"\.
SELECT "column1" from test

statement ok
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/join.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ statement ok
set datafusion.execution.batch_size = 4096;

# left semi with wrong where clause
query error DataFusion error: Schema error: No field named t2\.t2_id\. Did you mean 't1\.t1_id'\?\.
query error DataFusion error: Schema error: No field named t2\.t2_id\. Did you mean 't1\.t1_id'\?\nValid fields are t1\.t1_id, t1\.t1_name, t1\.t1_int\.
SELECT t1.t1_id, t1.t1_name, t1.t1_int
FROM t1
LEFT SEMI JOIN t2 ON t1.t1_id = t2.t2_id
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/references.slt
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ CREATE TABLE test("f.c1" TEXT, "test.c2" INT, "...." INT) AS VALUES
('foobar', 2, 20),
('foobaz', 3, 30);

query error DataFusion error: Schema error: No field named f1\.c1\. Valid fields are test\."f\.c1", test\."test\.c2", test\."\.\.\.\."\.
query error DataFusion error: Schema error: No field named f1\.c1\. Did you mean 'test\."f\.c1"'\?\nValid fields are test\."f\.c1", test\."test\.c2", test\."\.\.\.\."\.
SELECT f1.c1 FROM test;

query T
Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/select.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1181,7 +1181,7 @@ SELECT * FROM empty_table
statement ok
CREATE TABLE case_sensitive_table("INT32" int) AS VALUES (1), (2), (3), (4), (5);

statement error DataFusion error: Schema error: No field named int32\. Valid fields are case_sensitive_table\."INT32"\.
statement error DataFusion error: Schema error: No field named int32\. Did you mean 'case_sensitive_table\."INT32"'\?\nColumn names are case sensitive\. You can use double quotes to refer to the case_sensitive_table\."INT32" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\.\nValid fields are case_sensitive_table\."INT32"\.
select "int32" from case_sensitive_table

query I
Expand Down Expand Up @@ -1823,7 +1823,7 @@ select a + b from (select 1 as a, 2 as b, 1 as "a + b");
3

# Can't reference an output column by expression over projection.
query error DataFusion error: Schema error: No field named a\. Valid fields are "a \+ Int64\(1\)"\.
query error DataFusion error: Schema error: No field named a\.\nValid fields are "a \+ Int64\(1\)"\.
select a + 1 from (select a+1 from (select 1 as a));

query I
Expand Down Expand Up @@ -1861,7 +1861,7 @@ statement ok
DROP TABLE test;

# Can't reference an unqualified column by a qualified name
query error DataFusion error: Schema error: No field named t1\.v1\. Column names are case sensitive\. You can use double quotes to refer to the "t1\.v1" column or set the datafusion\.sql_parser\.enable_ident_normalization configuration\. Valid fields are "t1\.v1"\.
query error DataFusion error: Schema error: No field named t1\.v1\. Did you mean '"t1\.v1"'\?\nValid fields are "t1\.v1"\.
SELECT t1.v1 FROM (SELECT 1 AS "t1.v1");

# Test issue: https://github.com/apache/datafusion/issues/14124
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/union_by_name.slt
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ NULL 5

# Ambiguous name

statement error DataFusion error: Schema error: No field named x. Valid fields are a, b.
statement error DataFusion error: Schema error: No field named x\.\nValid fields are a, b.
SELECT x AS a FROM t1 UNION BY NAME SELECT x AS b FROM t1 ORDER BY x;

query II
Expand Down
Loading