Skip to content

Commit

Permalink
simplify name conflict check in CTE resolution
Browse files Browse the repository at this point in the history
  • Loading branch information
cloud-fan committed Apr 27, 2020
1 parent 5dd581c commit dd6cec5
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 24 deletions.
Expand Up @@ -17,9 +17,11 @@

package org.apache.spark.sql.catalyst.analysis

import scala.collection.mutable

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, With}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY, LegacyBehaviorPolicy}
Expand All @@ -41,34 +43,45 @@ object CTESubstitution extends Rule[LogicalPlan] {
}

/**
* Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through
* child, innerChildren and subquery expressions for the current plan.
* Spark 3.0 changes the CTE relations resolution, and inner relations take precedence. This is
* correct but we need to warn users about this behavior change under EXCEPTION mode, when we see
* CTE relations with conflicting names.
*
* Note that, before Spark 3.0 the parser didn't support CTE in the FROM clause. For example,
* `WITH ... SELECT * FROM (WITH ... SELECT ...)` was not supported. We should not fail for this
* case, as Spark versions before 3.0 can't run it anyway. The parameter `startOfQuery` is used
* to indicate where we can define CTE relations before Spark 3.0, and we should only check
* name conflicts when `startOfQuery` is true.
*/
private def assertNoNameConflictsInCTE(
plan: LogicalPlan,
outerCTERelationNames: Set[String] = Set.empty,
namesInSubqueries: Set[String] = Set.empty): Unit = {
outerCTERelationNames: Seq[String] = Nil,
startOfQuery: Boolean = true): Unit = {
val resolver = SQLConf.get.resolver
plan match {
case w @ With(child, relations) =>
val newNames = relations.map {
case (cteName, _) =>
if (outerCTERelationNames.contains(cteName)) {
throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " +
case With(child, relations) =>
val newNames = mutable.ArrayBuffer.empty[String]
newNames ++= outerCTERelationNames
relations.foreach {
case (name, relation) =>
if (startOfQuery && outerCTERelationNames.exists(resolver(_, name))) {
throw new AnalysisException(s"Name $name is ambiguous in nested CTE. " +
s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED so that name " +
"defined in inner CTE takes precedence. If set it to LEGACY, outer CTE " +
"definitions will take precedence. See more details in SPARK-28228.")
} else {
cteName
}
}.toSet ++ namesInSubqueries
assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames)
w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames, newNames))
assertNoNameConflictsInCTE(relation, newNames)
newNames += name
}
assertNoNameConflictsInCTE(child, newNames, startOfQuery = false)

case s: SubqueryAlias if startOfQuery =>
assertNoNameConflictsInCTE(s.child, outerCTERelationNames)

case other =>
other.subqueries.foreach(
assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries))
other.subqueries.foreach(assertNoNameConflictsInCTE(_, outerCTERelationNames))
other.children.foreach(
assertNoNameConflictsInCTE(_, outerCTERelationNames, namesInSubqueries))
assertNoNameConflictsInCTE(_, outerCTERelationNames, startOfQuery = false))
}
}

Expand Down
16 changes: 16 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
Expand Up @@ -111,3 +111,19 @@ WHERE c IN (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
);

-- case insensitive name conflicts: in other CTE relations
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t;

-- case insensitive name conflicts: in subquery expressions
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
);
28 changes: 27 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13
-- Number of queries: 15


-- !query
Expand Down Expand Up @@ -179,3 +179,29 @@ WHERE c IN (
struct<c:int>
-- !query output
1


-- !query
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t
-- !query schema
struct<1:int>
-- !query output
1


-- !query
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
-- !query schema
struct<scalarsubquery():int>
-- !query output
1
35 changes: 31 additions & 4 deletions sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13
-- Number of queries: 15


-- !query
Expand Down Expand Up @@ -64,10 +64,9 @@ WITH
)
SELECT * FROM t2
-- !query schema
struct<>
struct<scalarsubquery():int>
-- !query output
org.apache.spark.sql.AnalysisException
Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
2


-- !query
Expand Down Expand Up @@ -186,3 +185,31 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;


-- !query
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;


-- !query
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13
-- Number of queries: 15


-- !query
Expand Down Expand Up @@ -179,3 +179,29 @@ WHERE c IN (
struct<c:int>
-- !query output



-- !query
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t
-- !query schema
struct<2:int>
-- !query output
2


-- !query
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
-- !query schema
struct<scalarsubquery():int>
-- !query output
2

0 comments on commit dd6cec5

Please sign in to comment.