Skip to content
Permalink
Browse files

[MINOR][SQL] Rename config name to spark.sql.analyzer.failAmbiguousSe…

…lfJoin.enabled

### What changes were proposed in this pull request?

Add the `.enabled` suffix to `spark.sql.analyzer.failAmbiguousSelfJoin`.

### Why are the changes needed?

to follow the existing naming style

### Does this PR introduce any user-facing change?

no

### How was this patch tested?

not needed

Closes #26694 from cloud-fan/conf.

Authored-by: Wenchen Fan <wenchen@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
cloud-fan committed Dec 2, 2019
1 parent 4e073f3 commit e271664a01fd7dee63391890514d76262cad1bc1
@@ -111,7 +111,7 @@ license: |

- The result of `java.lang.Math`'s `log`, `log1p`, `exp`, `expm1`, and `pow` may vary across platforms. In Spark 3.0, the equivalent SQL functions (including related SQL functions like `LOG10`) return values consistent with `java.lang.StrictMath`. In virtually all cases this makes no difference in the return value; where it does, the difference is very small, but the result may not exactly match `java.lang.Math` on x86 platforms in cases like, for example, `log(3.0)`, whose value varies between `Math.log()` and `StrictMath.log()`.

- Since Spark 3.0, Dataset query fails if it contains ambiguous column reference that is caused by self join. A typical example: `val df1 = ...; val df2 = df1.filter(...);`, then `df1.join(df2, df1("a") > df2("a"))` returns an empty result which is quite confusing. This is because Spark cannot resolve Dataset column references that point to tables being self joined, and `df1("a")` is exactly the same as `df2("a")` in Spark. To restore the behavior before Spark 3.0, you can set `spark.sql.analyzer.failAmbiguousSelfJoin` to `false`.
- Since Spark 3.0, a Dataset query fails if it contains an ambiguous column reference caused by a self join. A typical example: `val df1 = ...; val df2 = df1.filter(...);`, then, before Spark 3.0, `df1.join(df2, df1("a") > df2("a"))` returns an empty result, which is quite confusing. This is because Spark cannot resolve Dataset column references that point to tables being self joined, and `df1("a")` is exactly the same as `df2("a")` in Spark. To restore the behavior before Spark 3.0, you can set `spark.sql.analyzer.failAmbiguousSelfJoin.enabled` to `false`.

- Since Spark 3.0, the `Cast` function processes string literals such as 'Infinity', '+Infinity', '-Infinity', 'NaN', 'Inf', '+Inf', '-Inf' in a case-insensitive manner when casting the literals to `Double` or `Float` type, to ensure greater compatibility with other database systems. This behaviour change is illustrated in the table below:
<table class="table">
@@ -875,8 +875,8 @@ object SQLConf {
.booleanConf
.createWithDefault(true)

val FAIL_AMBIGUOUS_SELF_JOIN =
buildConf("spark.sql.analyzer.failAmbiguousSelfJoin")
val FAIL_AMBIGUOUS_SELF_JOIN_ENABLED =
buildConf("spark.sql.analyzer.failAmbiguousSelfJoin.enabled")
.doc("When true, fail the Dataset query if it contains ambiguous self-join.")
.internal()
.booleanConf
@@ -229,7 +229,7 @@ class Dataset[T] private[sql](
case _ =>
queryExecution.analyzed
}
if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN)) {
if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) {
plan.setTagValue(Dataset.DATASET_ID_TAG, id)
}
plan
@@ -1337,7 +1337,7 @@ class Dataset[T] private[sql](
private def addDataFrameIdToCol(expr: NamedExpression): NamedExpression = {
val newExpr = expr transform {
case a: AttributeReference
if sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN) =>
if sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED) =>
val metadata = new MetadataBuilder()
.withMetadata(a.metadata)
.putLong(Dataset.DATASET_ID_KEY, id)
@@ -71,7 +71,7 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] {
}

override def apply(plan: LogicalPlan): LogicalPlan = {
if (!conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN)) return plan
if (!conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) return plan

// We always remove the special metadata from `AttributeReference` at the end of this rule, so
// Dataset column reference only exists in the root node via Dataset transformations like
@@ -149,7 +149,7 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] {
"to figure out which one. Please alias the Datasets with different names via " +
"`Dataset.as` before joining them, and specify the column using qualified name, e.g. " +
"""`df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. You can also set """ +
s"${SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key} to false to disable this check.")
s"${SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key} to false to disable this check.")
}
}

@@ -96,7 +96,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
val df2 = df1.filter($"id" > 0)

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
// `df1("id") > df2("id")` is always false.
checkAnswer(df1.join(df2, df1("id") > df2("id")), Nil)
@@ -110,7 +110,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
}

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
assertAmbiguousSelfJoin(df1.join(df2, df1("id") > df2("id")))
}
@@ -121,7 +121,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
val df2 = df1.filter($"id" > 0)

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
assertAmbiguousSelfJoin(df1.join(df2, df1.colRegex("id") > df2.colRegex("id")))
}
@@ -132,7 +132,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
val df2 = df1.filter($"a.b" > 0)

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
assertAmbiguousSelfJoin(df1.join(df2, df1("a.b") > df2("a.c")))
}
@@ -143,7 +143,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
val df2 = df1.filter($"id" > 0)

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
// `df2("id")` actually points to the column of `df1`.
checkAnswer(df1.join(df2).select(df2("id")), Seq(0, 0, 1, 1, 2, 2).map(Row(_)))
@@ -157,7 +157,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
}

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
assertAmbiguousSelfJoin(df1.join(df2).select(df2("id")))
}
@@ -170,7 +170,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
val df4 = spark.range(1)

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
// `df2("id") < df3("id")` is always false
checkAnswer(df1.join(df2).join(df3, df2("id") < df3("id")), Nil)
@@ -196,7 +196,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
}

withSQLConf(
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
assertAmbiguousSelfJoin(df1.join(df2).join(df3, df2("id") < df3("id")))
assertAmbiguousSelfJoin(df1.join(df4).join(df2).select(df2("id")))

0 comments on commit e271664

Please sign in to comment.
You can’t perform that action at this time.