-
Notifications
You must be signed in to change notification settings - Fork 1.8k
JOIN conditions are order dependent #778
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1717,15 +1717,40 @@ fn create_case_context() -> Result<ExecutionContext> { | |
| #[tokio::test] | ||
| async fn equijoin() -> Result<()> { | ||
| let mut ctx = create_join_context("t1_id", "t2_id")?; | ||
| let sql = | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; | ||
| let actual = execute(&mut ctx, sql).await; | ||
| let equivalent_sql = [ | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t2_id = t1_id ORDER BY t1_id", | ||
| ]; | ||
| let expected = vec![ | ||
| vec!["11", "a", "z"], | ||
| vec!["22", "b", "y"], | ||
| vec!["44", "d", "x"], | ||
| ]; | ||
| assert_eq!(expected, actual); | ||
| for sql in equivalent_sql.iter() { | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn equijoin_multiple_condition_ordering() -> Result<()> { | ||
| let mut ctx = create_join_context("t1_id", "t2_id")?; | ||
| let equivalent_sql = [ | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id AND t1_name <> t2_name ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id AND t2_name <> t1_name ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t2_id = t1_id AND t1_name <> t2_name ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t2_id = t1_id AND t2_name <> t1_name ORDER BY t1_id", | ||
| ]; | ||
| let expected = vec![ | ||
| vec!["11", "a", "z"], | ||
| vec!["22", "b", "y"], | ||
| vec!["44", "d", "x"], | ||
| ]; | ||
| for sql in equivalent_sql.iter() { | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
|
|
@@ -1754,51 +1779,70 @@ async fn equijoin_and_unsupported_condition() -> Result<()> { | |
| #[tokio::test] | ||
| async fn left_join() -> Result<()> { | ||
| let mut ctx = create_join_context("t1_id", "t2_id")?; | ||
| let sql = "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; | ||
| let actual = execute(&mut ctx, sql).await; | ||
| let equivalent_sql = [ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if this test case and those below it add any additional coverage compared to async fn equijoin_multiple_condition_ordering() -> Result<()> {Like are there bugs that would cause
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My main objective was to ensure nothing specific to equijoin ( |
||
| "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t2_id = t1_id ORDER BY t1_id", | ||
| ]; | ||
| let expected = vec![ | ||
| vec!["11", "a", "z"], | ||
| vec!["22", "b", "y"], | ||
| vec!["33", "c", "NULL"], | ||
| vec!["44", "d", "x"], | ||
| ]; | ||
| assert_eq!(expected, actual); | ||
| for sql in equivalent_sql.iter() { | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn right_join() -> Result<()> { | ||
| let mut ctx = create_join_context("t1_id", "t2_id")?; | ||
| let sql = | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 RIGHT JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; | ||
| let actual = execute(&mut ctx, sql).await; | ||
| let equivalent_sql = [ | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 RIGHT JOIN t2 ON t1_id = t2_id ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 RIGHT JOIN t2 ON t2_id = t1_id ORDER BY t1_id" | ||
| ]; | ||
| let expected = vec![ | ||
| vec!["NULL", "NULL", "w"], | ||
| vec!["11", "a", "z"], | ||
| vec!["22", "b", "y"], | ||
| vec!["44", "d", "x"], | ||
| ]; | ||
| assert_eq!(expected, actual); | ||
| for sql in equivalent_sql.iter() { | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn full_join() -> Result<()> { | ||
| let mut ctx = create_join_context("t1_id", "t2_id")?; | ||
| let sql = "SELECT t1_id, t1_name, t2_name FROM t1 FULL JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; | ||
| let actual = execute(&mut ctx, sql).await; | ||
| let equivalent_sql = [ | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 FULL JOIN t2 ON t1_id = t2_id ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 FULL JOIN t2 ON t2_id = t1_id ORDER BY t1_id", | ||
| ]; | ||
| let expected = vec![ | ||
| vec!["NULL", "NULL", "w"], | ||
| vec!["11", "a", "z"], | ||
| vec!["22", "b", "y"], | ||
| vec!["33", "c", "NULL"], | ||
| vec!["44", "d", "x"], | ||
| ]; | ||
| assert_eq!(expected, actual); | ||
| for sql in equivalent_sql.iter() { | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| } | ||
|
|
||
| let sql = "SELECT t1_id, t1_name, t2_name FROM t1 FULL OUTER JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| let equivalent_sql = [ | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 FULL OUTER JOIN t2 ON t1_id = t2_id ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1 FULL OUTER JOIN t2 ON t2_id = t1_id ORDER BY t1_id", | ||
| ]; | ||
| for sql in equivalent_sql.iter() { | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
@@ -1821,15 +1865,19 @@ async fn left_join_using() -> Result<()> { | |
| #[tokio::test] | ||
| async fn equijoin_implicit_syntax() -> Result<()> { | ||
| let mut ctx = create_join_context("t1_id", "t2_id")?; | ||
| let sql = | ||
| "SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE t1_id = t2_id ORDER BY t1_id"; | ||
| let actual = execute(&mut ctx, sql).await; | ||
| let equivalent_sql = [ | ||
| "SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE t1_id = t2_id ORDER BY t1_id", | ||
| "SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE t2_id = t1_id ORDER BY t1_id", | ||
| ]; | ||
| let expected = vec![ | ||
| vec!["11", "a", "z"], | ||
| vec!["22", "b", "y"], | ||
| vec!["44", "d", "x"], | ||
| ]; | ||
| assert_eq!(expected, actual); | ||
| for sql in equivalent_sql.iter() { | ||
| let actual = execute(&mut ctx, sql).await; | ||
| assert_eq!(expected, actual); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't fully understand this code -- if
LogicalPlanBuilder::joinis called it has two inputs, a left (self) and a right (right) so the idea thatleft_keysmight contain references torightis confusing to me.I wonder if there might be a better place to fix up the errors -- perhaps somewhere further up in the call stack thatn
LogicalPlanBuilder::join-- for exampleextract_join_keysin the SQL planner (especially since this bug seems related to SQL planning)https://github.com/apache/arrow-datafusion/blob/master/datafusion/src/sql/planner.rs#L372-L380
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @alamb. Unfortunately my employment situation makes it hard to contribute too much but I am still very interested in seeing Datafusion succeed.
The
LogicalPlanBuilder::joinhasleftandrightkeys but they areleftandrightrelative to a join condition not to aleftorrightrelation.The
left_keysandright_keysare where the join conditions are resolved to underlying relations. The Postgres behavior does not require a specific order of conditions for joining so we need to be able to handle both conditions.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I think the argument name makes sense with the old assumption that left_keys should only refer to columns from the left plan. With this fix, it can become confusing.
Perhaps a better signature for this function would be something along the lines of:
join_keys: Vec<(impl Into<Column>, impl Into<Column>)>.If we want the dataframe API to also support order independent JOIN conditions, then it is indeed better to address the fix within this builder method. Otherwise I agree it would be better to fix it in the SQL planner instead. I am leaning towards supporting this in dataframe API as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
changing the signature to be
join_keys: Vec<(impl Into<Column>, impl Into<Column>)>.instead ofleft_keys:...andright_keys: ...makes sense to me