From 16d07c6da03706b206f0d24317e2ff8b1e489cf3 Mon Sep 17 00:00:00 2001 From: Arman Date: Tue, 30 Oct 2018 12:16:11 +0330 Subject: [PATCH 1/2] Added function to join two datasets using one column and get the join type as a parameter --- .../scala/org/apache/spark/sql/Dataset.scala | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index c91b0d778fab1..47fac83d22c71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -883,6 +883,31 @@ class Dataset[T] private[sql]( join(right, Seq(usingColumn)) } + /** + * Equi-join with another `DataFrame` using the given column. + * + * Different from other join functions, the join column will only appear once in the output, + * i.e. similar to SQL's `JOIN USING` syntax. + * + * {{{ + * // Left join of df1 and df2 using the column "user_id" + * df1.join(df2, "user_id", "left") + * }}} + * + * @param right Right side of the join operation. + * @param usingColumn Name of the column to join on. This column must exist on both sides. + * @param joinType Type of join to perform. Default `inner`. Must be one of: + * `inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`, + * `right`, `right_outer`, `left_semi`, `left_anti`. + * @note If you perform a self-join using this function without aliasing the input + * `DataFrame`s, you will NOT be able to reference any columns after the join, since + * there is no way to disambiguate which side of the join you would like to reference. + * @group untypedrel + */ + def join(right: Dataset[_], usingColumn: String, joinType: String): DataFrame = { + join(right, Seq(usingColumn), joinType) + } + /** * Inner equi-join with another `DataFrame` using the given columns. * From afbb07f9ea86c52c6a5e1598bc09be99289834bc Mon Sep 17 00:00:00 2001 From: Arman Date: Sun, 4 Nov 2018 09:54:26 +0330 Subject: [PATCH 2/2] Fixed indentation of docs --- .../scala/org/apache/spark/sql/Dataset.scala | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 47fac83d22c71..c52e9f3a4ecc7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -884,26 +884,26 @@ class Dataset[T] private[sql]( } /** - * Equi-join with another `DataFrame` using the given column. - * - * Different from other join functions, the join column will only appear once in the output, - * i.e. similar to SQL's `JOIN USING` syntax. - * - * {{{ - * // Left join of df1 and df2 using the column "user_id" - * df1.join(df2, "user_id", "left") - * }}} - * - * @param right Right side of the join operation. - * @param usingColumn Name of the column to join on. This column must exist on both sides. - * @param joinType Type of join to perform. Default `inner`. Must be one of: - * `inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`, - * `right`, `right_outer`, `left_semi`, `left_anti`. - * @note If you perform a self-join using this function without aliasing the input - * `DataFrame`s, you will NOT be able to reference any columns after the join, since - * there is no way to disambiguate which side of the join you would like to reference. - * @group untypedrel - */ + * Equi-join with another `DataFrame` using the given column. + * + * Different from other join functions, the join column will only appear once in the output, + * i.e. similar to SQL's `JOIN USING` syntax. + * + * {{{ + * // Left join of df1 and df2 using the column "user_id" + * df1.join(df2, "user_id", "left") + * }}} + * + * @param right Right side of the join operation. + * @param usingColumn Name of the column to join on. This column must exist on both sides. + * @param joinType Type of join to perform. Default `inner`. Must be one of: + * `inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`, + * `right`, `right_outer`, `left_semi`, `left_anti`. + * @note If you perform a self-join using this function without aliasing the input + * `DataFrame`s, you will NOT be able to reference any columns after the join, since + * there is no way to disambiguate which side of the join you would like to reference. + * @group untypedrel + */ def join(right: Dataset[_], usingColumn: String, joinType: String): DataFrame = { join(right, Seq(usingColumn), joinType) }