simplify name conflict check in CTE resolution

apache · Apr 27, 2020 · dd6cec5 · dd6cec5
1 parent 5dd581c
commit dd6cec5
Show file tree

Hide file tree

Showing 5 changed files with 132 additions and 24 deletions.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
@@ -17,9 +17,11 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import scala.collection.mutable
+
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, With}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY, LegacyBehaviorPolicy}
@@ -41,34 +43,45 @@ object CTESubstitution extends Rule[LogicalPlan] {
   }
 
   /**
-   * Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through
-   * child, innerChildren and subquery expressions for the current plan.
+   * Spark 3.0 changes the CTE relations resolution, and inner relations take precedence. This is
+   * correct but we need to warn users about this behavior change under EXCEPTION mode, when we see
+   * CTE relations with conflicting names.
+   *
+   * Note that, before Spark 3.0 the parser didn't support CTE in the FROM clause. For example,
+   * `WITH ... SELECT * FROM (WITH ... SELECT ...)` was not supported. We should not fail for this
+   * case, as Spark versions before 3.0 can't run it anyway. The parameter `startOfQuery` is used
+   * to indicate where we can define CTE relations before Spark 3.0, and we should only check
+   * name conflicts when `startOfQuery` is true.
    */
   private def assertNoNameConflictsInCTE(
       plan: LogicalPlan,
-      outerCTERelationNames: Set[String] = Set.empty,
-      namesInSubqueries: Set[String] = Set.empty): Unit = {
+      outerCTERelationNames: Seq[String] = Nil,
+      startOfQuery: Boolean = true): Unit = {
+    val resolver = SQLConf.get.resolver
     plan match {
-      case w @ With(child, relations) =>
-        val newNames = relations.map {
-          case (cteName, _) =>
-            if (outerCTERelationNames.contains(cteName)) {
-              throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " +
+      case With(child, relations) =>
+        val newNames = mutable.ArrayBuffer.empty[String]
+        newNames ++= outerCTERelationNames
+        relations.foreach {
+          case (name, relation) =>
+            if (startOfQuery && outerCTERelationNames.exists(resolver(_, name))) {
+              throw new AnalysisException(s"Name $name is ambiguous in nested CTE. " +
                 s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED so that name " +
                 "defined in inner CTE takes precedence. If set it to LEGACY, outer CTE " +
                 "definitions will take precedence. See more details in SPARK-28228.")
-            } else {
-              cteName
             }
-        }.toSet ++ namesInSubqueries
-        assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames)
-        w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames, newNames))
+            assertNoNameConflictsInCTE(relation, newNames)
+            newNames += name
+        }
+        assertNoNameConflictsInCTE(child, newNames, startOfQuery = false)
+
+      case s: SubqueryAlias if startOfQuery =>
+        assertNoNameConflictsInCTE(s.child, outerCTERelationNames)
 
       case other =>
-        other.subqueries.foreach(
-          assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries))
+        other.subqueries.foreach(assertNoNameConflictsInCTE(_, outerCTERelationNames))
         other.children.foreach(
-          assertNoNameConflictsInCTE(_, outerCTERelationNames, namesInSubqueries))
+          assertNoNameConflictsInCTE(_, outerCTERelationNames, startOfQuery = false))
     }
   }
 

diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
@@ -111,3 +111,19 @@ WHERE c IN (
   WITH t(c) AS (SELECT 2)
   SELECT * FROM t
 );
+
+-- case insensitive name conflicts: in other CTE relations
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t;
+
+-- case insensitive name conflicts: in subquery expressions
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+);
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 15
 
 
 -- !query
@@ -179,3 +179,29 @@ WHERE c IN (
 struct<c:int>
 -- !query output
 1
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+1
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 15
 
 
 -- !query
@@ -64,10 +64,9 @@ WITH
   )
 SELECT * FROM t2
 -- !query schema
-struct<>
+struct<scalarsubquery():int>
 -- !query output
-org.apache.spark.sql.AnalysisException
-Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
+2
 
 
 -- !query
@@ -186,3 +185,31 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 15
 
 
 -- !query
@@ -179,3 +179,29 @@ WHERE c IN (
 struct<c:int>
 -- !query output
 
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<2:int>
+-- !query output
+2
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+2