From 1c5ab03470ba1e99826c3238c0c6ae6fafef22ce Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 25 Feb 2021 16:00:46 -0800 Subject: [PATCH 01/34] Only use metadata columns for resolution as last resort Signed-off-by: Karen Feng --- .../catalyst/plans/logical/LogicalPlan.scala | 6 ++-- .../sql/connector/DataSourceV2SQLSuite.scala | 29 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 89d65e13fb57f..ceb9726bc6d69 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -89,8 +89,9 @@ abstract class LogicalPlan } } - private[this] lazy val childAttributes = - AttributeSeq(children.flatMap(c => c.output ++ c.metadataOutput)) + private[this] lazy val childAttributes = AttributeSeq(children.flatMap(_.output)) + + private[this] lazy val childMetadataAttributes = AttributeSeq(children.flatMap(_.metadataOutput)) private[this] lazy val outputAttributes = AttributeSeq(output) @@ -103,6 +104,7 @@ abstract class LogicalPlan nameParts: Seq[String], resolver: Resolver): Option[NamedExpression] = childAttributes.resolve(nameParts, resolver) + .orElse(childMetadataAttributes.resolve(nameParts, resolver)) /** * Optionally resolves the given strings to a [[NamedExpression]] based on the output of this diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 2f57298856fb5..d9dd4a24f8ae0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2492,6 +2492,35 @@ class DataSourceV2SQLSuite } } + test("SPARK-34547: metadata columns are resolved last") { + val t1 = s"${catalogAndNamespace}tableOne" + val t2 = s"${catalogAndNamespace}tableTwo" + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") + sql(s"CREATE TABLE $t2 (id bigint, index int) USING $v2Format") + sql(s"INSERT INTO $t2 VALUES (1, -1), (2, -2), (3, -3)") + + val sqlQuery = spark.sql(s"SELECT $t1.id, $t2.id, data, index, $t1.index, $t2.index FROM " + + s"$t1 JOIN $t2 WHERE $t1.id = $t2.id") + val t1Table = spark.table(t1) + val t2Table = spark.table(t2) + val dfQuery = t1Table.join(t2Table, t1Table.col("id") === t2Table.col("id")) + .select(s"$t1.id", s"$t2.id", "data", "index", s"$t1.index", s"$t2.index") + + Seq(sqlQuery, dfQuery).foreach { query => + checkAnswer(query, + Seq( + Row(1, 1, "a", -1, 0, -1), + Row(2, 2, "b", -2, 0, -2), + Row(3, 3, "c", -3, 0, -3) + ) + ) + } + } + } + test("SPARK-33505: insert into partitioned table") { val t = "testpart.ns1.ns2.tbl" withTable(t) { From 2fe733fe05bcce7a471eb715e58b490e05109ed9 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Fri, 26 Feb 2021 10:40:24 -0800 Subject: [PATCH 02/34] Resolve deduplicated common columns in NATURAL/USING JOIN with hidden column Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 110 +++--- .../sql/catalyst/analysis/unresolved.scala | 15 +- .../plans/logical/basicLogicalOperators.scala | 12 +- 
.../spark/sql/catalyst/util/package.scala | 27 +- .../v2/DataSourceV2Implicits.scala | 10 +- .../sql-tests/inputs/natural-join.sql | 57 ++- .../resources/sql-tests/inputs/using-join.sql | 70 ++++ .../sql-tests/results/natural-join.sql.out | 215 ++++++++++- .../sql-tests/results/using-join.sql.out | 338 ++++++++++++++++++ 9 files changed, 769 insertions(+), 85 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/using-join.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/using-join.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 3952cc063b73c..9518b3d4f29b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -977,7 +977,7 @@ class Analyzer(override val catalogManager: CatalogManager) * * References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]], * but the relation's output does not include the metadata columns until the relation is replaced - * using [[DataSourceV2Relation.withMetadataColumns()]]. Unless this rule adds metadata to the + * with a copy adding them to the output. Unless this rule adds metadata to the relation's output, * relation's output, the analyzer will detect that nothing produces the columns. * * This rule only adds metadata columns when a node is resolved but is missing input from its @@ -986,7 +986,7 @@ class Analyzer(override val catalogManager: CatalogManager) * columns are not accidentally selected by *. */ object AddMetadataColumns extends Rule[LogicalPlan] { - import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ + import org.apache.spark.sql.catalyst.util._ private def hasMetadataCol(plan: LogicalPlan): Boolean = { plan.expressions.exists(_.find { @@ -997,6 +997,9 @@ class Analyzer(override val catalogManager: CatalogManager) private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { case r: DataSourceV2Relation => r.withMetadataColumns() + case p: Project => p.copy( + projectList = p.metadataOutput ++ p.projectList, + child = addMetadataCol(p.child)) case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) } @@ -1010,7 +1013,7 @@ class Analyzer(override val catalogManager: CatalogManager) node } else { val newNode = addMetadataCol(node) - // We should not change the output schema of the plan. We should project away the extr + // We should not change the output schema of the plan. We should project away the extra // metadata columns if necessary. if (newNode.sameOutput(node)) { newNode @@ -3281,6 +3284,59 @@ class Analyzer(override val catalogManager: CatalogManager) * Then apply a Project on a normal Join to eliminate natural or using join. 
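 * For illustration (editor's note, not introduced by this patch): an inner natural join such as
 *   SELECT * FROM nt1 NATURAL JOIN nt2
 * is still rewritten to a Project over an inner Join on the matching key columns, but with the
 * change below the duplicated right-side key is additionally recorded on that Project through a
 * hidden-output tree tag rather than simply dropped, so references such as nt2.k and nt2.* can
 * later be resolved against the hidden output.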
*/ object ResolveNaturalAndUsingJoin extends Rule[LogicalPlan] { + private def commonNaturalJoinProcessing( + left: LogicalPlan, + right: LogicalPlan, + joinType: JoinType, + joinNames: Seq[String], + condition: Option[Expression], + hint: JoinHint): LogicalPlan = { + import org.apache.spark.sql.catalyst.util._ + + val leftKeys = joinNames.map { keyName => + left.output.find(attr => resolver(attr.name, keyName)).getOrElse { + throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left") + } + } + val rightKeys = joinNames.map { keyName => + right.output.find(attr => resolver(attr.name, keyName)).getOrElse { + throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right") + } + } + val joinPairs = leftKeys.zip(rightKeys) + + val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And) + + // columns not in joinPairs + val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att)) + val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att)) + + // the output list looks like: join keys, columns from left, columns from right + val (projectList, hiddenList) = joinType match { + case LeftOuter => + (leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true)), rightKeys) + case LeftExistence(_) => + (leftKeys ++ lUniqueOutput, Seq.empty) + case RightOuter => + (rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput, leftKeys) + case FullOuter => + // in full outer join, joinCols should be non-null if there is. + val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() } + (joinedCols ++ + lUniqueOutput.map(_.withNullability(true)) ++ + rUniqueOutput.map(_.withNullability(true)), + leftKeys ++ rightKeys) + case _ : InnerLike => + (leftKeys ++ lUniqueOutput ++ rUniqueOutput, rightKeys) + case _ => + sys.error("Unsupported natural join type " + joinType) + } + // use Project to hide duplicated common keys + val project = Project(projectList, Join(left, right, joinType, newCondition, hint)) + project.setTagValue(project.hiddenOutputTag, hiddenList.map(_.asHiddenCol())) + project + } + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { case j @ Join(left, right, UsingJoin(joinType, usingCols), _, hint) if left.resolved && right.resolved && j.duplicateResolved => @@ -3368,54 +3424,6 @@ class Analyzer(override val catalogManager: CatalogManager) } } - private def commonNaturalJoinProcessing( - left: LogicalPlan, - right: LogicalPlan, - joinType: JoinType, - joinNames: Seq[String], - condition: Option[Expression], - hint: JoinHint) = { - val leftKeys = joinNames.map { keyName => - left.output.find(attr => resolver(attr.name, keyName)).getOrElse { - throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left") - } - } - val rightKeys = joinNames.map { keyName => - right.output.find(attr => resolver(attr.name, keyName)).getOrElse { - throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right") - } - } - val joinPairs = leftKeys.zip(rightKeys) - - val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And) - - // columns not in joinPairs - val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att)) - val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att)) - - // the output list looks like: join keys, columns from left, columns from right - val projectList = joinType match { - case LeftOuter => - leftKeys ++ lUniqueOutput ++ 
rUniqueOutput.map(_.withNullability(true)) - case LeftExistence(_) => - leftKeys ++ lUniqueOutput - case RightOuter => - rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput - case FullOuter => - // in full outer join, joinCols should be non-null if there is. - val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() } - joinedCols ++ - lUniqueOutput.map(_.withNullability(true)) ++ - rUniqueOutput.map(_.withNullability(true)) - case _ : InnerLike => - leftKeys ++ lUniqueOutput ++ rUniqueOutput - case _ => - sys.error("Unsupported natural join type " + joinType) - } - // use Project to trim unnecessary fields - Project(projectList, Join(left, right, joinType, newCondition, hint)) - } - /** * Replaces [[UnresolvedDeserializer]] with the deserialization expression that has been resolved * to the given input attributes. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 5b8fc51b7a654..a330f5fa4bf7c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.parser.ParserUtils import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, UnaryNode} -import org.apache.spark.sql.catalyst.util.quoteIdentifier +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.types.{DataType, Metadata, StructType} @@ -316,11 +316,11 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu * Returns true if the nameParts is a subset of the last elements of qualifier of the attribute. * * For example, the following should all return true: - * - `SELECT ns1.ns2.t.* FROM ns1.n2.t` where nameParts is Seq("ns1", "ns2", "t") and + * - `SELECT ns1.ns2.t.* FROM ns1.ns2.t` where nameParts is Seq("ns1", "ns2", "t") and * qualifier is Seq("ns1", "ns2", "t"). - * - `SELECT ns2.t.* FROM ns1.n2.t` where nameParts is Seq("ns2", "t") and + * - `SELECT ns2.t.* FROM ns1.ns2.t` where nameParts is Seq("ns2", "t") and * qualifier is Seq("ns1", "ns2", "t"). - * - `SELECT t.* FROM ns1.n2.t` where nameParts is Seq("t") and + * - `SELECT t.* FROM ns1.ns2.t` where nameParts is Seq("t") and * qualifier is Seq("ns1", "ns2", "t"). */ private def matchedQualifier( @@ -342,10 +342,13 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu override def expand( input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = { - // If there is no table specified, use all input attributes. + // If there is no table specified, use all non-hidden input attributes. 
if (target.isEmpty) return input.output - val expandedAttributes = input.output.filter(matchedQualifier(_, target.get, resolver)) + // If there is a table specified, use hidden input attributes as well + val hiddenOutput = input.metadataOutput.filter(_.isHiddenCol) + val expandedAttributes = (hiddenOutput ++ input.output).filter( + matchedQualifier(_, target.get, resolver)) if (expandedAttributes.nonEmpty) return expandedAttributes diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 3f20d8f67b44d..99f1ac1e6ff9f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -25,7 +25,8 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} -import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.catalyst.trees.TreeNodeTag +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.random.RandomSampler @@ -76,6 +77,13 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) override lazy val validConstraints: ExpressionSet = getAllValidConstraints(projectList) + + val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput") + + override def metadataOutput: Seq[Attribute] = { + child.metadataOutput ++ + getTagValue(hiddenOutputTag).getOrElse(Seq.empty[Attribute]) + } } /** @@ -950,7 +958,7 @@ case class SubqueryAlias( override def metadataOutput: Seq[Attribute] = { val qualifierList = identifier.qualifier :+ alias - child.metadataOutput.map(_.withQualifier(qualifierList)) + child.metadataOutput.filterNot(_.isHiddenCol).map(_.withQualifier(qualifierList)) } override def doCanonicalize(): LogicalPlan = child.canonicalized diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index a041fb1aed78f..3ed9b6fe30d55 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -25,7 +25,7 @@ import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{NumericType, StringType} +import org.apache.spark.sql.types.{MetadataBuilder, NumericType, StringType} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils @@ -193,4 +193,29 @@ package object util extends Logging { def truncatedString[T](seq: Seq[T], sep: String, maxFields: Int): String = { truncatedString(seq, "", sep, "", maxFields) } + + val METADATA_COL_ATTR_KEY = "__metadata_col" + implicit class MetadataColumnHelper(attr: Attribute) { + def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && + attr.metadata.getBoolean(METADATA_COL_ATTR_KEY) + } + + /** + * Hidden columns 
are a type of metadata column that are not propagated through subquery aliases, + * and are candidates during qualified star expansions. + */ + val HIDDEN_COL_ATTR_KEY = "__hidden_col" + implicit class HiddenColumnHelper(attr: Attribute) { + def isHiddenCol: Boolean = attr.isMetadataCol && + attr.metadata.contains(HIDDEN_COL_ATTR_KEY) && + attr.metadata.getBoolean(HIDDEN_COL_ATTR_KEY) + + def asHiddenCol(): Attribute = attr.withMetadata( + new MetadataBuilder() + .withMetadata(attr.metadata) + .putBoolean(METADATA_COL_ATTR_KEY, true) + .putBoolean(HIDDEN_COL_ATTR_KEY, true) + .build() + ) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala index 4326c730f88dd..f388b48732791 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala @@ -21,14 +21,13 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{PartitionSpec, ResolvedPartitionSpec, UnresolvedPartitionSpec} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.util.METADATA_COL_ATTR_KEY import org.apache.spark.sql.connector.catalog.{MetadataColumn, SupportsAtomicPartitionManagement, SupportsDelete, SupportsPartitionManagement, SupportsRead, SupportsWrite, Table, TableCapability} import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap object DataSourceV2Implicits { - private val METADATA_COL_ATTR_KEY = "__metadata_col" - implicit class TableHelper(table: Table) { def asReadable: SupportsRead = { table match { @@ -95,11 +94,6 @@ object DataSourceV2Implicits { def toAttributes: Seq[AttributeReference] = asStruct.toAttributes } - implicit class MetadataColumnHelper(attr: Attribute) { - def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && - attr.metadata.getBoolean(METADATA_COL_ATTR_KEY) - } - implicit class OptionsHelper(options: Map[String, String]) { def asOptions: CaseInsensitiveStringMap = { new CaseInsensitiveStringMap(options.asJava) diff --git a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql index 71a50157b766c..b3f654a7e918f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql @@ -1,15 +1,22 @@ create temporary view nt1 as select * from values - ("one", 1), - ("two", 2), - ("three", 3) - as nt1(k, v1); + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1); create temporary view nt2 as select * from values - ("one", 1), - ("two", 22), - ("one", 5) - as nt2(k, v2); + ("one", 1), + ("two", 22), + ("one", 5) + as nt2(k, v2); +create temporary view nt3 as select * from values + ("one", 4), + ("two", 5), + ("one", 6) + as nt3(k, v3); + +SELECT * FROM nt1 natural join nt2; SELECT * FROM nt1 natural join nt2 where k = "one"; @@ -18,3 +25,37 @@ SELECT * FROM nt1 natural left join nt2 order by v1, v2; SELECT * FROM nt1 natural right join nt2 order by v1, v2; SELECT count(*) FROM nt1 natural full outer join nt2; + +SELECT k 
FROM nt1 natural join nt2; + +SELECT k FROM nt1 natural join nt2 where k = "one"; + +SELECT nt1.* FROM nt1 natural join nt2; + +SELECT nt2.* FROM nt1 natural join nt2; + +SELECT sbq.* from (SELECT * FROM nt1 natural join nt2) sbq; + +SELECT nt1.*, nt2.* FROM nt1 natural join nt2; + +SELECT *, nt2.k FROM nt1 natural join nt2; + +SELECT nt1.k, nt2.k FROM nt1 natural join nt2; + +SELECT nt1.k, nt2.k FROM nt1 natural join nt2 where k = "one"; + +SELECT * FROM (SELECT * FROM nt1 natural join nt2); + +SELECT * FROM (SELECT nt1.*, nt2.* FROM nt1 natural join nt2); + +SELECT * FROM (SELECT nt1.v1, nt2.k FROM nt1 natural join nt2); + +SELECT nt2.k FROM (SELECT * FROM nt1 natural join nt2); + +SELECT * FROM nt1 natural join nt2 natural join nt3; + +SELECT nt1.*, nt2.*, nt3.* FROM nt1 natural join nt2 natural join nt3; + +SELECT * FROM nt1 natural join nt2 join nt3 on nt1.k = nt3.k; + +SELECT * FROM nt1 natural join nt2 join nt3 on nt2.k = nt3.k; diff --git a/sql/core/src/test/resources/sql-tests/inputs/using-join.sql b/sql/core/src/test/resources/sql-tests/inputs/using-join.sql new file mode 100644 index 0000000000000..5d1859049acf4 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/using-join.sql @@ -0,0 +1,70 @@ +create temporary view nt1 as select * from values + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1); + +create temporary view nt2 as select * from values + ("one", 1), + ("two", 22), + ("one", 5), + ("four", 4) + as nt2(k, v2); + +SELECT * FROM nt1 left outer join nt2 using (k); + +SELECT k FROM nt1 left outer join nt2 using (k); + +SELECT nt1.*, nt2.* FROM nt1 left outer join nt2 using (k); + +SELECT nt1.k, nt2.k FROM nt1 left outer join nt2 using (k); + +SELECT k, nt1.k FROM nt1 left outer join nt2 using (k); + +SELECT k, nt2.k FROM nt1 left outer join nt2 using (k); + +SELECT * FROM nt1 left semi join nt2 using (k); + +SELECT k FROM nt1 left semi join nt2 using (k); + +SELECT nt1.* FROM nt1 left semi join nt2 using (k); + +SELECT nt1.k FROM nt1 left semi join nt2 using (k); + +SELECT k, nt1.k FROM nt1 left semi join nt2 using (k); + +SELECT * FROM nt1 right outer join nt2 using (k); + +SELECT k FROM nt1 right outer join nt2 using (k); + +SELECT nt1.*, nt2.* FROM nt1 right outer join nt2 using (k); + +SELECT nt1.k, nt2.k FROM nt1 right outer join nt2 using (k); + +SELECT k, nt1.k FROM nt1 right outer join nt2 using (k); + +SELECT k, nt2.k FROM nt1 right outer join nt2 using (k); + +SELECT * FROM nt1 full outer join nt2 using (k); + +SELECT k FROM nt1 full outer join nt2 using (k); + +SELECT nt1.*, nt2.* FROM nt1 full outer join nt2 using (k); + +SELECT nt1.k, nt2.k FROM nt1 full outer join nt2 using (k); + +SELECT k, nt1.k FROM nt1 full outer join nt2 using (k); + +SELECT k, nt2.k FROM nt1 full outer join nt2 using (k); + +SELECT * FROM nt1 full outer join nt2 using (k); + +SELECT k FROM nt1 inner join nt2 using (k); + +SELECT nt1.*, nt2.* FROM nt1 inner join nt2 using (k); + +SELECT nt1.k, nt2.k FROM nt1 inner join nt2 using (k); + +SELECT k, nt1.k FROM nt1 inner join nt2 using (k); + +SELECT k, nt2.k FROM nt1 inner join nt2 using (k); diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index 13f319700df3f..4ed2c09cdc7dc 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -1,13 +1,13 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 6 +-- 
Number of queries: 25 -- !query create temporary view nt1 as select * from values - ("one", 1), - ("two", 2), - ("three", 3) - as nt1(k, v1) + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1) -- !query schema struct<> -- !query output @@ -16,16 +16,38 @@ struct<> -- !query create temporary view nt2 as select * from values - ("one", 1), - ("two", 22), - ("one", 5) - as nt2(k, v2) + ("one", 1), + ("two", 22), + ("one", 5) + as nt2(k, v2) -- !query schema struct<> -- !query output +-- !query +create temporary view nt3 as select * from values + ("one", 4), + ("two", 5), + ("one", 6) + as nt3(k, v3) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM nt1 natural join nt2 +-- !query schema +struct +-- !query output +one 1 1 +one 1 5 +two 2 22 + + -- !query SELECT * FROM nt1 natural join nt2 where k = "one" -- !query schema @@ -62,3 +84,178 @@ SELECT count(*) FROM nt1 natural full outer join nt2 struct -- !query output 4 + + +-- !query +SELECT k FROM nt1 natural join nt2 +-- !query schema +struct +-- !query output +one +one +two + + +-- !query +SELECT k FROM nt1 natural join nt2 where k = "one" +-- !query schema +struct +-- !query output +one +one + + +-- !query +SELECT nt1.* FROM nt1 natural join nt2 +-- !query schema +struct +-- !query output +one 1 +one 1 +two 2 + + +-- !query +SELECT nt2.* FROM nt1 natural join nt2 +-- !query schema +struct +-- !query output +one 1 +one 5 +two 22 + + +-- !query +SELECT sbq.* from (SELECT * FROM nt1 natural join nt2) sbq +-- !query schema +struct +-- !query output +one 1 1 +one 1 5 +two 2 22 + + +-- !query +SELECT nt1.*, nt2.* FROM nt1 natural join nt2 +-- !query schema +struct +-- !query output +one 1 one 1 +one 1 one 5 +two 2 two 22 + + +-- !query +SELECT *, nt2.k FROM nt1 natural join nt2 +-- !query schema +struct +-- !query output +one 1 1 one +one 1 5 one +two 2 22 two + + +-- !query +SELECT nt1.k, nt2.k FROM nt1 natural join nt2 +-- !query schema +struct +-- !query output +one one +one one +two two + + +-- !query +SELECT nt1.k, nt2.k FROM nt1 natural join nt2 where k = "one" +-- !query schema +struct +-- !query output +one one +one one + + +-- !query +SELECT * FROM (SELECT * FROM nt1 natural join nt2) +-- !query schema +struct +-- !query output +one 1 1 +one 1 5 +two 2 22 + + +-- !query +SELECT * FROM (SELECT nt1.*, nt2.* FROM nt1 natural join nt2) +-- !query schema +struct +-- !query output +one 1 one 1 +one 1 one 5 +two 2 two 22 + + +-- !query +SELECT * FROM (SELECT nt1.v1, nt2.k FROM nt1 natural join nt2) +-- !query schema +struct +-- !query output +1 one +1 one +2 two + + +-- !query +SELECT nt2.k FROM (SELECT * FROM nt1 natural join nt2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '`nt2.k`' given input columns: [__auto_generated_subquery_name.k, __auto_generated_subquery_name.v1, __auto_generated_subquery_name.v2]; line 1 pos 7 + + +-- !query +SELECT * FROM nt1 natural join nt2 natural join nt3 +-- !query schema +struct +-- !query output +one 1 1 4 +one 1 1 6 +one 1 5 4 +one 1 5 6 +two 2 22 5 + + +-- !query +SELECT nt1.*, nt2.*, nt3.* FROM nt1 natural join nt2 natural join nt3 +-- !query schema +struct +-- !query output +one 1 one 1 one 4 +one 1 one 1 one 6 +one 1 one 5 one 4 +one 1 one 5 one 6 +two 2 two 22 two 5 + + +-- !query +SELECT * FROM nt1 natural join nt2 join nt3 on nt1.k = nt3.k +-- !query schema +struct +-- !query output +one 1 1 one 4 +one 1 1 one 6 +one 1 5 one 4 +one 1 5 one 6 +two 2 22 two 5 + + +-- !query +SELECT * FROM nt1 natural join nt2 
join nt3 on nt2.k = nt3.k +-- !query schema +struct +-- !query output +one 1 1 one 4 +one 1 1 one 6 +one 1 5 one 4 +one 1 5 one 6 +two 2 22 two 5 diff --git a/sql/core/src/test/resources/sql-tests/results/using-join.sql.out b/sql/core/src/test/resources/sql-tests/results/using-join.sql.out new file mode 100644 index 0000000000000..c078cbaf2eefd --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/using-join.sql.out @@ -0,0 +1,338 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 31 + + +-- !query +create temporary view nt1 as select * from values + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temporary view nt2 as select * from values + ("one", 1), + ("two", 22), + ("one", 5), + ("four", 4) + as nt2(k, v2) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM nt1 left outer join nt2 using (k) +-- !query schema +struct +-- !query output +one 1 1 +one 1 5 +three 3 NULL +two 2 22 + + +-- !query +SELECT k FROM nt1 left outer join nt2 using (k) +-- !query schema +struct +-- !query output +one +one +three +two + + +-- !query +SELECT nt1.*, nt2.* FROM nt1 left outer join nt2 using (k) +-- !query schema +struct +-- !query output +one 1 one 1 +one 1 one 5 +three 3 NULL NULL +two 2 two 22 + + +-- !query +SELECT nt1.k, nt2.k FROM nt1 left outer join nt2 using (k) +-- !query schema +struct +-- !query output +one one +one one +three NULL +two two + + +-- !query +SELECT k, nt1.k FROM nt1 left outer join nt2 using (k) +-- !query schema +struct +-- !query output +one one +one one +three three +two two + + +-- !query +SELECT k, nt2.k FROM nt1 left outer join nt2 using (k) +-- !query schema +struct +-- !query output +one one +one one +three NULL +two two + + +-- !query +SELECT * FROM nt1 left semi join nt2 using (k) +-- !query schema +struct +-- !query output +one 1 +two 2 + + +-- !query +SELECT k FROM nt1 left semi join nt2 using (k) +-- !query schema +struct +-- !query output +one +two + + +-- !query +SELECT nt1.* FROM nt1 left semi join nt2 using (k) +-- !query schema +struct +-- !query output +one 1 +two 2 + + +-- !query +SELECT nt1.k FROM nt1 left semi join nt2 using (k) +-- !query schema +struct +-- !query output +one +two + + +-- !query +SELECT k, nt1.k FROM nt1 left semi join nt2 using (k) +-- !query schema +struct +-- !query output +one one +two two + + +-- !query +SELECT * FROM nt1 right outer join nt2 using (k) +-- !query schema +struct +-- !query output +four NULL 4 +one 1 1 +one 1 5 +two 2 22 + + +-- !query +SELECT k FROM nt1 right outer join nt2 using (k) +-- !query schema +struct +-- !query output +four +one +one +two + + +-- !query +SELECT nt1.*, nt2.* FROM nt1 right outer join nt2 using (k) +-- !query schema +struct +-- !query output +NULL NULL four 4 +one 1 one 1 +one 1 one 5 +two 2 two 22 + + +-- !query +SELECT nt1.k, nt2.k FROM nt1 right outer join nt2 using (k) +-- !query schema +struct +-- !query output +NULL four +one one +one one +two two + + +-- !query +SELECT k, nt1.k FROM nt1 right outer join nt2 using (k) +-- !query schema +struct +-- !query output +four NULL +one one +one one +two two + + +-- !query +SELECT k, nt2.k FROM nt1 right outer join nt2 using (k) +-- !query schema +struct +-- !query output +four four +one one +one one +two two + + +-- !query +SELECT * FROM nt1 full outer join nt2 using (k) +-- !query schema +struct +-- !query output +four NULL 4 +one 1 1 +one 1 5 +three 3 NULL +two 2 22 + + +-- !query +SELECT k FROM nt1 full 
outer join nt2 using (k) +-- !query schema +struct +-- !query output +four +one +one +three +two + + +-- !query +SELECT nt1.*, nt2.* FROM nt1 full outer join nt2 using (k) +-- !query schema +struct +-- !query output +NULL NULL four 4 +one 1 one 1 +one 1 one 5 +three 3 NULL NULL +two 2 two 22 + + +-- !query +SELECT nt1.k, nt2.k FROM nt1 full outer join nt2 using (k) +-- !query schema +struct +-- !query output +NULL four +one one +one one +three NULL +two two + + +-- !query +SELECT k, nt1.k FROM nt1 full outer join nt2 using (k) +-- !query schema +struct +-- !query output +four NULL +one one +one one +three three +two two + + +-- !query +SELECT k, nt2.k FROM nt1 full outer join nt2 using (k) +-- !query schema +struct +-- !query output +four four +one one +one one +three NULL +two two + + +-- !query +SELECT * FROM nt1 full outer join nt2 using (k) +-- !query schema +struct +-- !query output +four NULL 4 +one 1 1 +one 1 5 +three 3 NULL +two 2 22 + + +-- !query +SELECT k FROM nt1 inner join nt2 using (k) +-- !query schema +struct +-- !query output +one +one +two + + +-- !query +SELECT nt1.*, nt2.* FROM nt1 inner join nt2 using (k) +-- !query schema +struct +-- !query output +one 1 one 1 +one 1 one 5 +two 2 two 22 + + +-- !query +SELECT nt1.k, nt2.k FROM nt1 inner join nt2 using (k) +-- !query schema +struct +-- !query output +one one +one one +two two + + +-- !query +SELECT k, nt1.k FROM nt1 inner join nt2 using (k) +-- !query schema +struct +-- !query output +one one +one one +two two + + +-- !query +SELECT k, nt2.k FROM nt1 inner join nt2 using (k) +-- !query schema +struct +-- !query output +one one +one one +two two From 80beda814ea7a0cadca441cf820501df596db3dc Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Fri, 26 Feb 2021 17:25:19 -0800 Subject: [PATCH 03/34] Fix behavior in Scala Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 14 ++++++++++-- .../plans/logical/AnalysisHelper.scala | 4 +++- .../apache/spark/sql/DataFrameJoinSuite.scala | 22 +++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 9518b3d4f29b2..7e0f664d33b49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -988,9 +988,19 @@ class Analyzer(override val catalogManager: CatalogManager) object AddMetadataColumns extends Rule[LogicalPlan] { import org.apache.spark.sql.catalyst.util._ + private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) + plan.expressions.collect { + case a: Attribute if a.isMetadataCol => a + case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => + childMetadataOutput.find(_.exprId == a.exprId).get + } + } + private def hasMetadataCol(plan: LogicalPlan): Boolean = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) plan.expressions.exists(_.find { - case a: Attribute => a.isMetadataCol + case a: Attribute => a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) case _ => false }.isDefined) } @@ -1006,7 +1016,7 @@ class Analyzer(override val catalogManager: CatalogManager) def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { case node if node.children.nonEmpty && node.resolved && 
hasMetadataCol(node) => val inputAttrs = AttributeSet(node.children.flatMap(_.output)) - val metaCols = node.expressions.flatMap(_.collect { + val metaCols = getMetadataAttributes(node).flatMap(_.collect { case a: Attribute if a.isMetadataCol && !inputAttrs.contains(a) => a }) if (metaCols.isEmpty) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala index 54b01416381c6..629b39afb04e7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala @@ -85,7 +85,7 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan => if (!analyzed) { AnalysisHelper.allowInvokingTransformsInAnalyzer { val afterRuleOnChildren = mapChildren(_.resolveOperatorsUp(rule)) - if (self fastEquals afterRuleOnChildren) { + val newNode = if (self fastEquals afterRuleOnChildren) { CurrentOrigin.withOrigin(origin) { rule.applyOrElse(self, identity[LogicalPlan]) } @@ -94,6 +94,8 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan => rule.applyOrElse(afterRuleOnChildren, identity[LogicalPlan]) } } + newNode.copyTagsFrom(this) + newNode } } else { self diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index c2555a1991414..a803fa88ed313 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -477,4 +477,26 @@ class DataFrameJoinSuite extends QueryTest checkAnswer(df3.except(df4), Row(10, 50, 2, Row(10, 50, 2))) } + + test("SPARK-34527: Resolve common columns from USING JOIN") { + val joinDf = testData2.as("testData2").join( + testData3.as("testData3"), usingColumns = Seq("a"), joinType = "fullouter") + val dfQuery = joinDf.select( + $"a", $"testData2.a", $"testData2.b", $"testData3.a", $"testData3.b") + val dfQuery2 = joinDf.select( + $"a", testData2.col("a"), testData2.col("b"), testData3.col("a"), testData3.col("b")) + + Seq(dfQuery, dfQuery2).map { query => + checkAnswer(query, + Seq( + Row(1, 1, 1, 1, null), + Row(1, 1, 2, 1, null), + Row(2, 2, 1, 2, 2), + Row(2, 2, 2, 2, 2), + Row(3, 3, 1, null, null), + Row(3, 3, 2, null, null) + ) + ) + } + } } From e1719d30d7893da8991a78eb6e3fa098d65334ae Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Fri, 26 Feb 2021 21:20:45 -0800 Subject: [PATCH 04/34] Fix nested expression Signed-off-by: Karen Feng --- .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 7e0f664d33b49..82d49e7e487ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -990,17 +990,18 @@ class Analyzer(override val catalogManager: CatalogManager) private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - plan.expressions.collect { + plan.expressions.flatMap(_.collect { case a: Attribute if a.isMetadataCol => a case 
a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => childMetadataOutput.find(_.exprId == a.exprId).get - } + }) } private def hasMetadataCol(plan: LogicalPlan): Boolean = { lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) plan.expressions.exists(_.find { - case a: Attribute => a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) + case a: Attribute => + a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) case _ => false }.isDefined) } @@ -1016,9 +1017,7 @@ class Analyzer(override val catalogManager: CatalogManager) def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) => val inputAttrs = AttributeSet(node.children.flatMap(_.output)) - val metaCols = getMetadataAttributes(node).flatMap(_.collect { - case a: Attribute if a.isMetadataCol && !inputAttrs.contains(a) => a - }) + val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) if (metaCols.isEmpty) { node } else { From 0ba1916ae587fd303abee8787d7d62fc84044623 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 4 Mar 2021 10:04:38 -0800 Subject: [PATCH 05/34] Address comments Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 108 +++++++++--------- .../plans/logical/basicLogicalOperators.scala | 3 - .../spark/sql/catalyst/util/package.scala | 12 +- .../sql-tests/inputs/natural-join.sql | 24 ++-- .../resources/sql-tests/inputs/using-join.sql | 18 +-- .../sql-tests/results/natural-join.sql.out | 24 ++-- .../sql-tests/results/using-join.sql.out | 18 +-- 7 files changed, 103 insertions(+), 104 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 2b5b54ed1b4c2..cb17f9efc2078 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -980,7 +980,7 @@ class Analyzer(override val catalogManager: CatalogManager) * References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]], * but the relation's output does not include the metadata columns until the relation is replaced * with a copy adding them to the output. Unless this rule adds metadata to the relation's output, - * relation's output, the analyzer will detect that nothing produces the columns. + * the analyzer will detect that nothing produces the columns. * * This rule only adds metadata columns when a node is resolved but is missing input from its * children. This ensures that metadata columns are not added to the plan unless they are used. By @@ -3295,59 +3295,6 @@ class Analyzer(override val catalogManager: CatalogManager) * Then apply a Project on a normal Join to eliminate natural or using join. 
*/ object ResolveNaturalAndUsingJoin extends Rule[LogicalPlan] { - private def commonNaturalJoinProcessing( - left: LogicalPlan, - right: LogicalPlan, - joinType: JoinType, - joinNames: Seq[String], - condition: Option[Expression], - hint: JoinHint): LogicalPlan = { - import org.apache.spark.sql.catalyst.util._ - - val leftKeys = joinNames.map { keyName => - left.output.find(attr => resolver(attr.name, keyName)).getOrElse { - throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left") - } - } - val rightKeys = joinNames.map { keyName => - right.output.find(attr => resolver(attr.name, keyName)).getOrElse { - throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right") - } - } - val joinPairs = leftKeys.zip(rightKeys) - - val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And) - - // columns not in joinPairs - val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att)) - val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att)) - - // the output list looks like: join keys, columns from left, columns from right - val (projectList, hiddenList) = joinType match { - case LeftOuter => - (leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true)), rightKeys) - case LeftExistence(_) => - (leftKeys ++ lUniqueOutput, Seq.empty) - case RightOuter => - (rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput, leftKeys) - case FullOuter => - // in full outer join, joinCols should be non-null if there is. - val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() } - (joinedCols ++ - lUniqueOutput.map(_.withNullability(true)) ++ - rUniqueOutput.map(_.withNullability(true)), - leftKeys ++ rightKeys) - case _ : InnerLike => - (leftKeys ++ lUniqueOutput ++ rUniqueOutput, rightKeys) - case _ => - sys.error("Unsupported natural join type " + joinType) - } - // use Project to hide duplicated common keys - val project = Project(projectList, Join(left, right, joinType, newCondition, hint)) - project.setTagValue(project.hiddenOutputTag, hiddenList.map(_.asHiddenCol())) - project - } - override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { case j @ Join(left, right, UsingJoin(joinType, usingCols), _, hint) if left.resolved && right.resolved && j.duplicateResolved => @@ -3435,6 +3382,59 @@ class Analyzer(override val catalogManager: CatalogManager) } } + private def commonNaturalJoinProcessing( + left: LogicalPlan, + right: LogicalPlan, + joinType: JoinType, + joinNames: Seq[String], + condition: Option[Expression], + hint: JoinHint): LogicalPlan = { + import org.apache.spark.sql.catalyst.util._ + + val leftKeys = joinNames.map { keyName => + left.output.find(attr => resolver(attr.name, keyName)).getOrElse { + throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left") + } + } + val rightKeys = joinNames.map { keyName => + right.output.find(attr => resolver(attr.name, keyName)).getOrElse { + throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right") + } + } + val joinPairs = leftKeys.zip(rightKeys) + + val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And) + + // columns not in joinPairs + val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att)) + val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att)) + + // the output list looks like: join keys, columns from left, columns from right + val (projectList, hiddenList) = 
joinType match { + case LeftOuter => + (leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true)), rightKeys) + case LeftExistence(_) => + (leftKeys ++ lUniqueOutput, Seq.empty) + case RightOuter => + (rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput, leftKeys) + case FullOuter => + // in full outer join, joinCols should be non-null if there is. + val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() } + (joinedCols ++ + lUniqueOutput.map(_.withNullability(true)) ++ + rUniqueOutput.map(_.withNullability(true)), + leftKeys ++ rightKeys) + case _ : InnerLike => + (leftKeys ++ lUniqueOutput ++ rUniqueOutput, rightKeys) + case _ => + sys.error("Unsupported natural join type " + joinType) + } + // use Project to hide duplicated common keys + val project = Project(projectList, Join(left, right, joinType, newCondition, hint)) + project.setTagValue(hiddenOutputTag, hiddenList.map(_.asHiddenCol())) + project + } + /** * Replaces [[UnresolvedDeserializer]] with the deserialization expression that has been resolved * to the given input attributes. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 99f1ac1e6ff9f..d9209e1f14d18 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} -import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -78,8 +77,6 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) override lazy val validConstraints: ExpressionSet = getAllValidConstraints(projectList) - val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput") - override def metadataOutput: Seq[Attribute] = { child.metadataOutput ++ getTagValue(hiddenOutputTag).getOrElse(Seq.empty[Attribute]) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index 3ed9b6fe30d55..458512adcaa1c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -24,6 +24,7 @@ import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{MetadataBuilder, NumericType, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -195,17 +196,18 @@ package object util extends Logging { } val METADATA_COL_ATTR_KEY = "__metadata_col" - implicit class MetadataColumnHelper(attr: Attribute) { - def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && - 
attr.metadata.getBoolean(METADATA_COL_ATTR_KEY) - } /** * Hidden columns are a type of metadata column that are not propagated through subquery aliases, * and are candidates during qualified star expansions. */ val HIDDEN_COL_ATTR_KEY = "__hidden_col" - implicit class HiddenColumnHelper(attr: Attribute) { + val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput") + + implicit class SpecialColumnHelper(attr: Attribute) { + def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && + attr.metadata.getBoolean(METADATA_COL_ATTR_KEY) + def isHiddenCol: Boolean = attr.isMetadataCol && attr.metadata.contains(HIDDEN_COL_ATTR_KEY) && attr.metadata.getBoolean(HIDDEN_COL_ATTR_KEY) diff --git a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql index b3f654a7e918f..8247c9e054c52 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql @@ -1,20 +1,20 @@ create temporary view nt1 as select * from values - ("one", 1), - ("two", 2), - ("three", 3) - as nt1(k, v1); + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1); create temporary view nt2 as select * from values - ("one", 1), - ("two", 22), - ("one", 5) - as nt2(k, v2); + ("one", 1), + ("two", 22), + ("one", 5) + as nt2(k, v2); create temporary view nt3 as select * from values - ("one", 4), - ("two", 5), - ("one", 6) - as nt3(k, v3); + ("one", 4), + ("two", 5), + ("one", 6) + as nt3(k, v3); SELECT * FROM nt1 natural join nt2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/using-join.sql b/sql/core/src/test/resources/sql-tests/inputs/using-join.sql index 5d1859049acf4..336d19f0f2a3d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/using-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/using-join.sql @@ -1,15 +1,15 @@ create temporary view nt1 as select * from values - ("one", 1), - ("two", 2), - ("three", 3) - as nt1(k, v1); + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1); create temporary view nt2 as select * from values - ("one", 1), - ("two", 22), - ("one", 5), - ("four", 4) - as nt2(k, v2); + ("one", 1), + ("two", 22), + ("one", 5), + ("four", 4) + as nt2(k, v2); SELECT * FROM nt1 left outer join nt2 using (k); diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index 4ed2c09cdc7dc..cecd1417c0064 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -4,10 +4,10 @@ -- !query create temporary view nt1 as select * from values - ("one", 1), - ("two", 2), - ("three", 3) - as nt1(k, v1) + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1) -- !query schema struct<> -- !query output @@ -16,10 +16,10 @@ struct<> -- !query create temporary view nt2 as select * from values - ("one", 1), - ("two", 22), - ("one", 5) - as nt2(k, v2) + ("one", 1), + ("two", 22), + ("one", 5) + as nt2(k, v2) -- !query schema struct<> -- !query output @@ -28,10 +28,10 @@ struct<> -- !query create temporary view nt3 as select * from values - ("one", 4), - ("two", 5), - ("one", 6) - as nt3(k, v3) + ("one", 4), + ("two", 5), + ("one", 6) + as nt3(k, v3) -- !query schema struct<> -- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/using-join.sql.out 
b/sql/core/src/test/resources/sql-tests/results/using-join.sql.out index c078cbaf2eefd..1d2ae9d96ecad 100644 --- a/sql/core/src/test/resources/sql-tests/results/using-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/using-join.sql.out @@ -4,10 +4,10 @@ -- !query create temporary view nt1 as select * from values - ("one", 1), - ("two", 2), - ("three", 3) - as nt1(k, v1) + ("one", 1), + ("two", 2), + ("three", 3) + as nt1(k, v1) -- !query schema struct<> -- !query output @@ -16,11 +16,11 @@ struct<> -- !query create temporary view nt2 as select * from values - ("one", 1), - ("two", 22), - ("one", 5), - ("four", 4) - as nt2(k, v2) + ("one", 1), + ("two", 22), + ("one", 5), + ("four", 4) + as nt2(k, v2) -- !query schema struct<> -- !query output From 0c116a59aff69ce4aca4c504099abeef600f1e55 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Thu, 4 Mar 2021 15:01:20 -0800 Subject: [PATCH 06/34] Push SQL output Signed-off-by: Karen Feng --- .../results/udf/postgreSQL/udf-join.sql.out | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out index 6fd4a067c7c11..9acd4d24b5a49 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out @@ -1790,19 +1790,19 @@ SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) -- !query schema struct -- !query output + 8 8 eight NULL + 4 1 four NULL NULL NULL null NULL + 7 7 seven NULL + 6 6 six NULL NULL 0 zero NULL 0 NULL zero NULL 1 4 one -1 2 3 two 2 2 3 two 4 3 2 three -3 - 4 1 four NULL 5 0 five -5 5 0 five -5 - 6 6 six NULL - 7 7 seven NULL - 8 8 eight NULL -- !query @@ -1812,19 +1812,19 @@ SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k) -- !query schema struct -- !query output + 8 8 eight NULL + 4 1 four NULL NULL NULL null NULL + 7 7 seven NULL + 6 6 six NULL NULL 0 zero NULL 0 NULL zero NULL 1 4 one -1 2 3 two 2 2 3 two 4 3 2 three -3 - 4 1 four NULL 5 0 five -5 5 0 five -5 - 6 6 six NULL - 7 7 seven NULL - 8 8 eight NULL -- !query From bf87f555d4bbccbc30cfa12219568ba5d2e0ed51 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Sat, 6 Mar 2021 22:32:36 -0800 Subject: [PATCH 07/34] Re-do changes Signed-off-by: Karen Feng --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 2 +- .../sql/catalyst/plans/logical/basicLogicalOperators.scala | 6 +++++- .../scala/org/apache/spark/sql/catalyst/util/package.scala | 1 - 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f4f0abdc67229..654a20c2ef3cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -3437,7 +3437,7 @@ class Analyzer(override val catalogManager: CatalogManager) } // use Project to hide duplicated common keys val project = Project(projectList, Join(left, right, joinType, newCondition, hint)) - project.setTagValue(hiddenOutputTag, hiddenList.map(_.asHiddenCol())) + project.setTagValue(Project.hiddenOutputTag, hiddenList.map(_.asHiddenCol())) project } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index d9209e1f14d18..4c73c03cc0708 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -79,10 +79,14 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) override def metadataOutput: Seq[Attribute] = { child.metadataOutput ++ - getTagValue(hiddenOutputTag).getOrElse(Seq.empty[Attribute]) + getTagValue(Project.hiddenOutputTag).getOrElse(Seq.empty[Attribute]) } } +object Project { + val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput") +} + /** * Applies a [[Generator]] to a stream of input rows, combining the * output of each into a new stream of rows. This operation is similar to a `flatMap` in functional diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index 458512adcaa1c..ef8a79d570651 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -202,7 +202,6 @@ package object util extends Logging { * and are candidates during qualified star expansions. */ val HIDDEN_COL_ATTR_KEY = "__hidden_col" - val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput") implicit class SpecialColumnHelper(attr: Attribute) { def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && From b5dc44f1352357d28650a8d2b0c81bdfea9c8035 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Sun, 7 Mar 2021 20:44:53 -0800 Subject: [PATCH 08/34] Fixup tag Signed-off-by: Karen Feng --- .../sql/catalyst/plans/logical/basicLogicalOperators.scala | 3 ++- .../scala/org/apache/spark/sql/catalyst/util/package.scala | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 4c73c03cc0708..d4a3a227a72a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} +import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -84,7 +85,7 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) } object Project { - val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput") + val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hidden_output") } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index 
ef8a79d570651..8586963f1faec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -24,7 +24,6 @@ import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{MetadataBuilder, NumericType, StringType} import org.apache.spark.unsafe.types.UTF8String From 7c3f5df8d079fff3464b3a94237e3ba6b1899fe0 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 8 Mar 2021 15:59:05 -0800 Subject: [PATCH 09/34] Add another failing test Signed-off-by: Karen Feng --- .../test/resources/sql-tests/inputs/natural-join.sql | 2 ++ .../resources/sql-tests/results/natural-join.sql.out | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql index 8247c9e054c52..dad70c00257e6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql @@ -56,6 +56,8 @@ SELECT * FROM nt1 natural join nt2 natural join nt3; SELECT nt1.*, nt2.*, nt3.* FROM nt1 natural join nt2 natural join nt3; +SELECT nt1.*, nt2.*, nt3.* FROM nt1 natural join nt2 join nt3 on nt2.k = nt3.k; + SELECT * FROM nt1 natural join nt2 join nt3 on nt1.k = nt3.k; SELECT * FROM nt1 natural join nt2 join nt3 on nt2.k = nt3.k; diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index cecd1417c0064..a12d587bca1cd 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 25 +-- Number of queries: 26 -- !query @@ -237,6 +237,15 @@ one 1 one 5 one 6 two 2 two 22 two 5 +-- !query +SELECT nt1.*, nt2.*, nt3.* FROM nt1 natural join nt2 join nt3 on nt2.k = nt3.k +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Resolved attribute(s) k#x missing from k#x,v1#x,v2#x,k#x,v3#x in operator !Project [k#x, v1#x, k#x, v2#x, k#x, v3#x]. Attribute(s) with the same name appear in the operation: k. Please check if the right attribute(s) are used. 
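For context, the scenario behind this newly recorded failure, as a minimal standalone sketch (not part of the patch): the nt1/nt2 data matches natural-join.sql, and the nt3 rows are inferred from the expected rows recorded later in the series. After nt1 NATURAL JOIN nt2, only a single `k` remains in the visible output, so `nt2.k` in the outer join condition can only come from hidden output; at this point in the series the analyzer resolves it but then reports it missing from the child Project, which is exactly the exception above.

import org.apache.spark.sql.SparkSession

object NaturalJoinHiddenColumnSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    // Data as in sql-tests/inputs/natural-join.sql; nt3 rows inferred from the expected output.
    Seq(("one", 1), ("two", 2), ("three", 3)).toDF("k", "v1").createOrReplaceTempView("nt1")
    Seq(("one", 1), ("two", 22), ("one", 5)).toDF("k", "v2").createOrReplaceTempView("nt2")
    Seq(("one", 4), ("two", 5), ("one", 6)).toDF("k", "v3").createOrReplaceTempView("nt3")

    // nt2.k survives the natural join only as a hidden column; this is the query the new
    // golden file records as failing here and as succeeding after the later patches.
    spark.sql(
      """SELECT nt1.*, nt2.*, nt3.*
        |FROM nt1 NATURAL JOIN nt2 JOIN nt3 ON nt2.k = nt3.k""".stripMargin).show()

    spark.stop()
  }
}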
+ + -- !query SELECT * FROM nt1 natural join nt2 join nt3 on nt1.k = nt3.k -- !query schema From 181751aa00c8d273c8c894301aef2e041f0564ee Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 29 Mar 2021 13:09:49 -0700 Subject: [PATCH 10/34] Merge add metadata and resolve missing references Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 139 +++++++++--------- .../catalyst/plans/logical/LogicalPlan.scala | 12 +- .../results/udf/postgreSQL/udf-join.sql.out | 16 +- 3 files changed, 84 insertions(+), 83 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index d678d894f4b8f..ac92617c47e51 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -252,7 +252,6 @@ class Analyzer(override val catalogManager: CatalogManager) ResolveRelations :: ResolveTables :: ResolvePartitionSpec :: - AddMetadataColumns :: ResolveReferences :: ResolveCreateNamedStruct :: ResolveDeserializer :: @@ -974,67 +973,6 @@ class Analyzer(override val catalogManager: CatalogManager) } } - /** - * Adds metadata columns to output for child relations when nodes are missing resolved attributes. - * - * References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]], - * but the relation's output does not include the metadata columns until the relation is replaced - * with a copy adding them to the output. Unless this rule adds metadata to the relation's output, - * the analyzer will detect that nothing produces the columns. - * - * This rule only adds metadata columns when a node is resolved but is missing input from its - * children. This ensures that metadata columns are not added to the plan unless they are used. By - * checking only resolved nodes, this ensures that * expansion is already done so that metadata - * columns are not accidentally selected by *. 
- */ - object AddMetadataColumns extends Rule[LogicalPlan] { - import org.apache.spark.sql.catalyst.util._ - - private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - plan.expressions.flatMap(_.collect { - case a: Attribute if a.isMetadataCol => a - case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => - childMetadataOutput.find(_.exprId == a.exprId).get - }) - } - - private def hasMetadataCol(plan: LogicalPlan): Boolean = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - plan.expressions.exists(_.find { - case a: Attribute => - a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) - case _ => false - }.isDefined) - } - - private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { - case r: DataSourceV2Relation => r.withMetadataColumns() - case p: Project => p.copy( - projectList = p.metadataOutput ++ p.projectList, - child = addMetadataCol(p.child)) - case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) - } - - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { - case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) => - val inputAttrs = AttributeSet(node.children.flatMap(_.output)) - val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) - if (metaCols.isEmpty) { - node - } else { - val newNode = addMetadataCol(node) - // We should not change the output schema of the plan. We should project away the extra - // metadata columns if necessary. - if (newNode.sameOutput(node)) { - newNode - } else { - Project(node.output, newNode) - } - } - } - } - /** * Resolve table relations with concrete relations from v2 catalog. * @@ -1964,7 +1902,7 @@ class Analyzer(override val catalogManager: CatalogManager) expr, plan, resolveColumnByName = nameParts => { - plan.resolve(nameParts, resolver) + plan.resolve(nameParts, resolver, withMetadata = false) }, resolveColumnByOrdinal = ordinal => { assert(ordinal >= 0 && ordinal < plan.output.length) @@ -2087,15 +2025,28 @@ class Analyzer(override val catalogManager: CatalogManager) } /** - * In many dialects of SQL it is valid to sort by attributes that are not present in the SELECT - * clause. This rule detects such queries and adds the required attributes to the original - * projection, so that they will be available during sorting. Another projection is added to - * remove these attributes after sorting. + * Adds missing references, including both real output and metadata output. * - * The HAVING clause could also used a grouping columns that is not presented in the SELECT. + * In many dialects of SQL it is valid to use attributes during sorting and grouping are not + * present in the SELECT clause. This rule detects such queries and adds the required attributes + * to the original projection, so that they will be available during the operation. + * Another projection is added to remove these attributes after the operation. + * + * This rule also adds metadata columns to output for child relations when nodes are missing + * resolved attributes. References to metadata columns are resolved using columns from + * [[LogicalPlan.metadataOutput]], but the relation's output does not include the metadata columns + * until the relation is replaced with a copy adding them to the output. Unless this rule adds + * metadata to the relation's output, the analyzer will detect that nothing produces the columns. 
+ * This rule only adds metadata columns when a node is resolved but is missing input from its + * children. This ensures that metadata columns are not added to the plan unless they are used. By + * checking only resolved nodes, this ensures that * expansion is already done so that metadata + * columns are not accidentally selected by *. */ object ResolveMissingReferences extends Rule[LogicalPlan] { + import org.apache.spark.sql.catalyst.util._ + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { + // Special cases for adding real output // Skip sort with aggregate. This will be handled in ResolveAggregateFunctions case sa @ Sort(_, _, child: Aggregate) => sa @@ -2120,13 +2071,57 @@ class Analyzer(override val catalogManager: CatalogManager) val newFilter = Filter(newCond.head, newChild) Project(child.output, newFilter) } + + // Add metadata output to all node types + case node if node.children.nonEmpty && node.resolved && node.missingInput.nonEmpty && + hasMetadataCol(node) => + val inputAttrs = AttributeSet(node.children.flatMap(_.output)) + val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) + if (metaCols.isEmpty) { + node + } else { + val newNode = addMetadataCol(node) + // We should not change the output schema of the plan. We should project away the extra + // metadata columns if necessary. + if (newNode.sameOutput(node)) { + newNode + } else { + Project(node.output, newNode) + } + } + } + + private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) + plan.expressions.flatMap(_.collect { + case a: Attribute if a.isMetadataCol => a + case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => + childMetadataOutput.find(_.exprId == a.exprId).get + }) + } + + private def hasMetadataCol(plan: LogicalPlan): Boolean = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) + plan.expressions.exists(_.find { + case a: Attribute => + a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) + case _ => false + }.isDefined) + } + + private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { + case r: DataSourceV2Relation => r.withMetadataColumns() + case p: Project => p.copy( + projectList = p.metadataOutput ++ p.projectList, + child = addMetadataCol(p.child)) + case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) } /** - * This method tries to resolve expressions and find missing attributes recursively. Specially, - * when the expressions used in `Sort` or `Filter` contain unresolved attributes or resolved - * attributes which are missed from child output. This method tries to find the missing - * attributes out and add into the projection. + * This method tries to resolve expressions and find missing attributes recursively. + * Specifically, when the expressions used in `Sort` or `Filter` contain unresolved attributes + * or resolved attributes which are missing from child output. This method tries to find the + * missing attributes and add them into the projection. 
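Setting metadata aside, the behavior this rewritten docstring describes is the ordinary case of ordering by a column the query does not select; a minimal standalone sketch with illustrative names (not taken from the patch):

import org.apache.spark.sql.SparkSession

object MissingReferencesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    Seq((3L, "c"), (1L, "a"), (2L, "b")).toDF("id", "data").createOrReplaceTempView("t")

    // `id` is not in the SELECT list, so the rule adds it to the projection feeding the Sort
    // and then wraps the plan in another Project that removes it from the final output.
    spark.sql("SELECT data FROM t ORDER BY id").show()

    spark.stop()
  }
}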
*/ private def resolveExprsAndAddMissingAttrs( exprs: Seq[Expression], plan: LogicalPlan): (Seq[Expression], LogicalPlan) = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 0eff5558627b6..8ea822b5436c2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -115,9 +115,15 @@ abstract class LogicalPlan */ def resolve( nameParts: Seq[String], - resolver: Resolver): Option[NamedExpression] = - outputAttributes.resolve(nameParts, resolver) - .orElse(outputMetadataAttributes.resolve(nameParts, resolver)) + resolver: Resolver, + withMetadata: Boolean = true): Option[NamedExpression] = { + val maybeResolvedExpression = outputAttributes.resolve(nameParts, resolver) + if (withMetadata) { + maybeResolvedExpression.orElse(outputMetadataAttributes.resolve(nameParts, resolver)) + } else { + maybeResolvedExpression + } + } /** * Given an attribute name, split it to name parts by dot, but diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out index 9acd4d24b5a49..6fd4a067c7c11 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out @@ -1790,19 +1790,19 @@ SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) -- !query schema struct -- !query output - 8 8 eight NULL - 4 1 four NULL NULL NULL null NULL - 7 7 seven NULL - 6 6 six NULL NULL 0 zero NULL 0 NULL zero NULL 1 4 one -1 2 3 two 2 2 3 two 4 3 2 three -3 + 4 1 four NULL 5 0 five -5 5 0 five -5 + 6 6 six NULL + 7 7 seven NULL + 8 8 eight NULL -- !query @@ -1812,19 +1812,19 @@ SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k) -- !query schema struct -- !query output - 8 8 eight NULL - 4 1 four NULL NULL NULL null NULL - 7 7 seven NULL - 6 6 six NULL NULL 0 zero NULL 0 NULL zero NULL 1 4 one -1 2 3 two 2 2 3 two 4 3 2 three -3 + 4 1 four NULL 5 0 five -5 5 0 five -5 + 6 6 six NULL + 7 7 seven NULL + 8 8 eight NULL -- !query From ad5e8244f61275ee1362f5edc28233b3d4821a0c Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 29 Mar 2021 15:21:00 -0700 Subject: [PATCH 11/34] Resolve down to avoid prematurely projecting out Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 110 ++++++++++-------- .../sql-tests/results/natural-join.sql.out | 11 +- 2 files changed, 71 insertions(+), 50 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index ac92617c47e51..17697d0d0c96b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -252,6 +252,7 @@ class Analyzer(override val catalogManager: CatalogManager) ResolveRelations :: ResolveTables :: ResolvePartitionSpec :: + AddMetadataColumns :: ResolveReferences :: ResolveCreateNamedStruct :: ResolveDeserializer :: @@ -2043,7 +2044,6 @@ class Analyzer(override val catalogManager: CatalogManager) * columns are not accidentally selected by *. 
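At this point in the series `resolve` (and later `resolveChildren`) carries a `withMetadata` flag; a later patch in the series removes it again so that resolution always falls back to metadata output. A standalone analogue of the lookup order, plain Scala rather than Catalyst code, with illustrative attribute names:

object ResolutionOrderSketch {
  final case class Attr(name: String, isMetadata: Boolean = false)

  // Regular output attributes win; metadata attributes are only a fallback, and callers can
  // opt out of the fallback entirely.
  def resolve(
      name: String,
      output: Seq[Attr],
      metadataOutput: Seq[Attr],
      withMetadata: Boolean = true): Option[Attr] = {
    val fromOutput = output.find(_.name == name)
    if (withMetadata) fromOutput.orElse(metadataOutput.find(_.name == name)) else fromOutput
  }

  def main(args: Array[String]): Unit = {
    val output = Seq(Attr("id"), Attr("data"))
    val metadata = Seq(Attr("index", isMetadata = true), Attr("_partition", isMetadata = true))

    assert(resolve("id", output, metadata).contains(Attr("id")))             // output wins
    assert(resolve("index", output, metadata).exists(_.isMetadata))          // fallback used
    assert(resolve("index", output, metadata, withMetadata = false).isEmpty) // fallback off
  }
}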
*/ object ResolveMissingReferences extends Rule[LogicalPlan] { - import org.apache.spark.sql.catalyst.util._ def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Special cases for adding real output @@ -2051,7 +2051,7 @@ class Analyzer(override val catalogManager: CatalogManager) case sa @ Sort(_, _, child: Aggregate) => sa case s @ Sort(order, _, child) - if (!s.resolved || s.missingInput.nonEmpty) && child.resolved => + if (!s.resolved || s.missingInput.nonEmpty) && child.resolved => val (newOrder, newChild) = resolveExprsAndAddMissingAttrs(order, child) val ordering = newOrder.map(_.asInstanceOf[SortOrder]) if (child.output == newChild.output) { @@ -2071,50 +2071,6 @@ class Analyzer(override val catalogManager: CatalogManager) val newFilter = Filter(newCond.head, newChild) Project(child.output, newFilter) } - - // Add metadata output to all node types - case node if node.children.nonEmpty && node.resolved && node.missingInput.nonEmpty && - hasMetadataCol(node) => - val inputAttrs = AttributeSet(node.children.flatMap(_.output)) - val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) - if (metaCols.isEmpty) { - node - } else { - val newNode = addMetadataCol(node) - // We should not change the output schema of the plan. We should project away the extra - // metadata columns if necessary. - if (newNode.sameOutput(node)) { - newNode - } else { - Project(node.output, newNode) - } - } - } - - private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - plan.expressions.flatMap(_.collect { - case a: Attribute if a.isMetadataCol => a - case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => - childMetadataOutput.find(_.exprId == a.exprId).get - }) - } - - private def hasMetadataCol(plan: LogicalPlan): Boolean = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - plan.expressions.exists(_.find { - case a: Attribute => - a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) - case _ => false - }.isDefined) - } - - private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { - case r: DataSourceV2Relation => r.withMetadataColumns() - case p: Project => p.copy( - projectList = p.metadataOutput ++ p.projectList, - child = addMetadataCol(p.child)) - case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) } /** @@ -2173,6 +2129,68 @@ class Analyzer(override val catalogManager: CatalogManager) } } + object AddMetadataColumns extends Rule[LogicalPlan] { + + import org.apache.spark.sql.catalyst.util._ + + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown { + // Add metadata output to all node types + case node if node.children.nonEmpty && node.resolved && node.missingInput.nonEmpty && + hasMetadataCol(node) => + println(s"Trying to restore missing input ${node.missingInput}") + val inputAttrs = AttributeSet(node.children.flatMap(_.output)) + val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) + println(s"Metadata attrs are ${getMetadataAttributes(node)}") + println(s"Input attrs are ${inputAttrs}") + if (metaCols.isEmpty) { + println(s"No meta cols") + node + } else { + val newNode = addMetadataCol(node) + println(s"Added metadata cols as ${newNode}") + // We should not change the output schema of the plan. We should project away the extra + // metadata columns if necessary. 
+ if (newNode.sameOutput(node)) { + println(s"Same output ${newNode.output}") + newNode + } else { + println(s"From output ${node.output} to ${newNode.output}") + Project(node.output, newNode) + } + } + } + + private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) + plan.expressions.flatMap(_.collect { + case a: Attribute if a.isMetadataCol => a + case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => + childMetadataOutput.find(_.exprId == a.exprId).get + }) + } + + private def hasMetadataCol(plan: LogicalPlan): Boolean = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) + val hasMetaCol = plan.expressions.exists(_.find { + case a: Attribute => + a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) + case _ => false + }.isDefined) + println(s"Plan $plan has metadata output: $hasMetaCol") + hasMetaCol + } + + private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { + case r: DataSourceV2Relation => r.withMetadataColumns() + case p: Project => + println(s"You find a project $p with metadata output ${p.metadataOutput}") + p.copy( + projectList = p.metadataOutput ++ p.projectList, + child = addMetadataCol(p.child)) + case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) + } + } + /** * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the * function registry. Note that this rule doesn't try to resolve the [[UnresolvedFunction]]. It diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index a12d587bca1cd..cacd30728753e 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -19,7 +19,7 @@ create temporary view nt2 as select * from values ("one", 1), ("two", 22), ("one", 5) - as nt2(k, v2) +as nt2(k, v2) -- !query schema struct<> -- !query output @@ -240,10 +240,13 @@ two 2 two 22 two 5 -- !query SELECT nt1.*, nt2.*, nt3.* FROM nt1 natural join nt2 join nt3 on nt2.k = nt3.k -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.AnalysisException -Resolved attribute(s) k#x missing from k#x,v1#x,v2#x,k#x,v3#x in operator !Project [k#x, v1#x, k#x, v2#x, k#x, v3#x]. Attribute(s) with the same name appear in the operation: k. Please check if the right attribute(s) are used. 
+one 1 one 1 one 4 +one 1 one 1 one 6 +one 1 one 5 one 4 +one 1 one 5 one 6 +two 2 two 22 two 5 -- !query From e36e8534cac927b54a89fe6fa6f19c97fb7ce83f Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 29 Mar 2021 15:22:20 -0700 Subject: [PATCH 12/34] Remove printlns Signed-off-by: Karen Feng --- .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 17697d0d0c96b..9869312faa3a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2137,24 +2137,17 @@ class Analyzer(override val catalogManager: CatalogManager) // Add metadata output to all node types case node if node.children.nonEmpty && node.resolved && node.missingInput.nonEmpty && hasMetadataCol(node) => - println(s"Trying to restore missing input ${node.missingInput}") val inputAttrs = AttributeSet(node.children.flatMap(_.output)) val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) - println(s"Metadata attrs are ${getMetadataAttributes(node)}") - println(s"Input attrs are ${inputAttrs}") if (metaCols.isEmpty) { - println(s"No meta cols") node } else { val newNode = addMetadataCol(node) - println(s"Added metadata cols as ${newNode}") // We should not change the output schema of the plan. We should project away the extra // metadata columns if necessary. if (newNode.sameOutput(node)) { - println(s"Same output ${newNode.output}") newNode } else { - println(s"From output ${node.output} to ${newNode.output}") Project(node.output, newNode) } } @@ -2176,14 +2169,12 @@ class Analyzer(override val catalogManager: CatalogManager) a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) case _ => false }.isDefined) - println(s"Plan $plan has metadata output: $hasMetaCol") hasMetaCol } private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { case r: DataSourceV2Relation => r.withMetadataColumns() case p: Project => - println(s"You find a project $p with metadata output ${p.metadataOutput}") p.copy( projectList = p.metadataOutput ++ p.projectList, child = addMetadataCol(p.child)) From 73b7c8a5c7203985ebf3299ca9e600dffca76a6f Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 29 Mar 2021 15:32:07 -0700 Subject: [PATCH 13/34] Clean up Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 141 +++++++++--------- 1 file changed, 72 insertions(+), 69 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 9e2844e4fa9ab..aee53bb99c2fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -995,6 +995,73 @@ class Analyzer(override val catalogManager: CatalogManager) } } + /** + * Adds metadata columns to output for child relations when nodes are missing resolved attributes. + * + * References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]], + * but the relation's output does not include the metadata columns until the relation is replaced. 
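The behavior described here and in the lines that follow can be seen from the user side with a sketch written in the style of DataSourceV2SQLSuite; the `catalogAndNamespace` and `v2Format` helpers, the in-memory table's `index` and `_partition` metadata columns, and the expected partition string are all assumed from the existing tests in that suite rather than introduced by this patch:

test("sketch: metadata columns are not picked up by star expansion") {
  val t1 = s"${catalogAndNamespace}table"
  withTable(t1) {
    sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " +
      "PARTITIONED BY (bucket(4, id), id)")
    sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')")

    // SELECT * sees only the declared columns, because the relation's output does not yet
    // contain the metadata columns...
    assert(sql(s"SELECT * FROM $t1").columns.toSeq === Seq("id", "data"))

    // ...but naming a metadata column makes AddMetadataColumns swap in a relation copy that
    // actually produces index and _partition.
    checkAnswer(
      sql(s"SELECT id, index, _partition FROM $t1 WHERE id = 1"),
      Seq(Row(1, 0, "3/1")))
  }
}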
+ * Unless this rule adds metadata to the relation's output, the analyzer will detect that nothing + * produces the columns. + * + * This rule only adds metadata columns when a node is resolved but is missing input from its + * children. This ensures that metadata columns are not added to the plan unless they are used. By + * checking only resolved nodes, this ensures that * expansion is already done so that metadata + * columns are not accidentally selected by *. This rule resolves operators downwards to avoid + * projecting away metadata columns prematurely. + */ + object AddMetadataColumns extends Rule[LogicalPlan] { + + import org.apache.spark.sql.catalyst.util._ + + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown { + // Add metadata output to all node types + case node if node.children.nonEmpty && node.resolved && node.missingInput.nonEmpty && + hasMetadataCol(node) => + val inputAttrs = AttributeSet(node.children.flatMap(_.output)) + val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) + if (metaCols.isEmpty) { + node + } else { + val newNode = addMetadataCol(node) + // We should not change the output schema of the plan. We should project away the extra + // metadata columns if necessary. + if (newNode.sameOutput(node)) { + newNode + } else { + Project(node.output, newNode) + } + } + } + + private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) + plan.expressions.flatMap(_.collect { + case a: Attribute if a.isMetadataCol => a + case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => + childMetadataOutput.find(_.exprId == a.exprId).get + }) + } + + private def hasMetadataCol(plan: LogicalPlan): Boolean = { + lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) + val hasMetaCol = plan.expressions.exists(_.find { + case a: Attribute => + a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) + case _ => false + }.isDefined) + hasMetaCol + } + + private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { + case r: DataSourceV2Relation => r.withMetadataColumns() + case p: Project => + p.copy( + projectList = p.metadataOutput ++ p.projectList, + child = addMetadataCol(p.child)) + case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) + } + } + /** * Resolve table relations with concrete relations from v2 catalog. * @@ -2030,27 +2097,16 @@ class Analyzer(override val catalogManager: CatalogManager) } /** - * Adds missing references, including both real output and metadata output. + * In many dialects of SQL it is valid to sort by attributes that are not present in the SELECT + * clause. This rule detects such queries and adds the required attributes to the original + * projection, so that they will be available during sorting. Another projection is added to + * remove these attributes after sorting. * - * In many dialects of SQL it is valid to use attributes during sorting and grouping are not - * present in the SELECT clause. This rule detects such queries and adds the required attributes - * to the original projection, so that they will be available during the operation. - * Another projection is added to remove these attributes after the operation. - * - * This rule also adds metadata columns to output for child relations when nodes are missing - * resolved attributes. 
References to metadata columns are resolved using columns from - * [[LogicalPlan.metadataOutput]], but the relation's output does not include the metadata columns - * until the relation is replaced with a copy adding them to the output. Unless this rule adds - * metadata to the relation's output, the analyzer will detect that nothing produces the columns. - * This rule only adds metadata columns when a node is resolved but is missing input from its - * children. This ensures that metadata columns are not added to the plan unless they are used. By - * checking only resolved nodes, this ensures that * expansion is already done so that metadata - * columns are not accidentally selected by *. + * The HAVING clause could also used a grouping columns that is not presented in the SELECT. */ object ResolveMissingReferences extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { - // Special cases for adding real output // Skip sort with aggregate. This will be handled in ResolveAggregateFunctions case sa @ Sort(_, _, child: Aggregate) => sa @@ -2133,59 +2189,6 @@ class Analyzer(override val catalogManager: CatalogManager) } } - object AddMetadataColumns extends Rule[LogicalPlan] { - - import org.apache.spark.sql.catalyst.util._ - - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown { - // Add metadata output to all node types - case node if node.children.nonEmpty && node.resolved && node.missingInput.nonEmpty && - hasMetadataCol(node) => - val inputAttrs = AttributeSet(node.children.flatMap(_.output)) - val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) - if (metaCols.isEmpty) { - node - } else { - val newNode = addMetadataCol(node) - // We should not change the output schema of the plan. We should project away the extra - // metadata columns if necessary. - if (newNode.sameOutput(node)) { - newNode - } else { - Project(node.output, newNode) - } - } - } - - private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - plan.expressions.flatMap(_.collect { - case a: Attribute if a.isMetadataCol => a - case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => - childMetadataOutput.find(_.exprId == a.exprId).get - }) - } - - private def hasMetadataCol(plan: LogicalPlan): Boolean = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - val hasMetaCol = plan.expressions.exists(_.find { - case a: Attribute => - a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) - case _ => false - }.isDefined) - hasMetaCol - } - - private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { - case r: DataSourceV2Relation => r.withMetadataColumns() - case p: Project => - p.copy( - projectList = p.metadataOutput ++ p.projectList, - child = addMetadataCol(p.child)) - case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) - } - } - /** * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the * function registry. Note that this rule doesn't try to resolve the [[UnresolvedFunction]]. 
It From 1eb01e2b2e76f632e4aa7d82242791f2646ca4b2 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 29 Mar 2021 16:17:32 -0700 Subject: [PATCH 14/34] Style fixup Signed-off-by: Karen Feng --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index aee53bb99c2fe..96f2ba160fdc0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2105,13 +2105,12 @@ class Analyzer(override val catalogManager: CatalogManager) * The HAVING clause could also used a grouping columns that is not presented in the SELECT. */ object ResolveMissingReferences extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Skip sort with aggregate. This will be handled in ResolveAggregateFunctions case sa @ Sort(_, _, child: Aggregate) => sa case s @ Sort(order, _, child) - if (!s.resolved || s.missingInput.nonEmpty) && child.resolved => + if (!s.resolved || s.missingInput.nonEmpty) && child.resolved => val (newOrder, newChild) = resolveExprsAndAddMissingAttrs(order, child) val ordering = newOrder.map(_.asInstanceOf[SortOrder]) if (child.output == newChild.output) { From 9fd24904317c38744314e54bbff00c6c94ac3477 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 29 Mar 2021 16:25:14 -0700 Subject: [PATCH 15/34] Formatting fix Signed-off-by: Karen Feng --- .../src/test/resources/sql-tests/results/natural-join.sql.out | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index cacd30728753e..81fb50ec68093 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -19,7 +19,7 @@ create temporary view nt2 as select * from values ("one", 1), ("two", 22), ("one", 5) -as nt2(k, v2) + as nt2(k, v2) -- !query schema struct<> -- !query output @@ -210,7 +210,7 @@ SELECT nt2.k FROM (SELECT * FROM nt1 natural join nt2) struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '`nt2.k`' given input columns: [__auto_generated_subquery_name.k, __auto_generated_subquery_name.v1, __auto_generated_subquery_name.v2]; line 1 pos 7 +cannot resolve 'nt2.k' given input columns: [__auto_generated_subquery_name.k, __auto_generated_subquery_name.v1, __auto_generated_subquery_name.v2]; line 1 pos 7 -- !query From f5cc3ae0db9538083c5efb27f3641c4dddec5faa Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 30 Mar 2021 11:29:16 -0700 Subject: [PATCH 16/34] Address comments Signed-off-by: Karen Feng --- .../spark/sql/catalyst/analysis/Analyzer.scala | 15 ++++++++------- .../sql/catalyst/plans/logical/LogicalPlan.scala | 12 +++++++++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 96f2ba160fdc0..dd87fff6e6f6d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1015,8 +1015,7 @@ class Analyzer(override val catalogManager: CatalogManager) def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown { // Add metadata output to all node types - case node if node.children.nonEmpty && node.resolved && node.missingInput.nonEmpty && - hasMetadataCol(node) => + case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) => val inputAttrs = AttributeSet(node.children.flatMap(_.output)) val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) if (metaCols.isEmpty) { @@ -1745,7 +1744,7 @@ class Analyzer(override val catalogManager: CatalogManager) case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString(conf.maxToStringFields)}") - q.mapExpressions(resolveExpressionByPlanChildren(_, q)) + q.mapExpressions(resolveExpressionByPlanChildren(_, q, withMetadata = true)) } def resolveAssignments( @@ -1982,11 +1981,12 @@ class Analyzer(override val catalogManager: CatalogManager) def resolveExpressionByPlanOutput( expr: Expression, plan: LogicalPlan, - throws: Boolean = false): Expression = { + throws: Boolean = false, + withMetadata: Boolean = false): Expression = { resolveExpression( expr, resolveColumnByName = nameParts => { - plan.resolve(nameParts, resolver, withMetadata = false) + plan.resolve(nameParts, resolver, withMetadata = withMetadata) }, getAttrCandidates = () => plan.output, throws = throws) @@ -2002,11 +2002,12 @@ class Analyzer(override val catalogManager: CatalogManager) */ def resolveExpressionByPlanChildren( e: Expression, - q: LogicalPlan): Expression = { + q: LogicalPlan, + withMetadata: Boolean = false): Expression = { resolveExpression( e, resolveColumnByName = nameParts => { - q.resolveChildren(nameParts, resolver) + q.resolveChildren(nameParts, resolver, withMetadata = withMetadata) }, getAttrCandidates = () => { assert(q.children.length == 1) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 8ea822b5436c2..ffa79cc6dc860 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -104,9 +104,15 @@ abstract class LogicalPlan */ def resolveChildren( nameParts: Seq[String], - resolver: Resolver): Option[NamedExpression] = - childAttributes.resolve(nameParts, resolver) - .orElse(childMetadataAttributes.resolve(nameParts, resolver)) + resolver: Resolver, + withMetadata: Boolean = true): Option[NamedExpression] = { + val maybeResolvedExpression = childAttributes.resolve(nameParts, resolver) + if (withMetadata) { + maybeResolvedExpression.orElse(childMetadataAttributes.resolve(nameParts, resolver)) + } else { + maybeResolvedExpression + } + } /** * Optionally resolves the given strings to a [[NamedExpression]] based on the output of this From fa7207e87c042dc66cc16c93b8936e7f80e952c3 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 31 Mar 2021 15:16:50 -0700 Subject: [PATCH 17/34] Metadata output should be empty by default Signed-off-by: Karen Feng --- .../plans/logical/EventTimeWatermark.scala | 2 + .../catalyst/plans/logical/LogicalPlan.scala | 7 +- .../plans/logical/basicLogicalOperators.scala | 44 ++++++++++-- .../sql/catalyst/plans/logical/hints.scala | 3 + .../sql/catalyst/plans/logical/object.scala | 6 ++ 
.../logical/pythonLogicalOperators.scala | 2 + .../sql/connector/DataSourceV2SQLSuite.scala | 67 +++++++++++++++++++ 7 files changed, 125 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala index b6bf7cd85d472..a591a9e160a8d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala @@ -61,4 +61,6 @@ case class EventTimeWatermark( a } } + + override val metadataOutput: Seq[Attribute] = child.metadataOutput } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index ffa79cc6dc860..69bd5edac37bb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -33,8 +33,11 @@ abstract class LogicalPlan with QueryPlanConstraints with Logging { - /** Metadata fields that can be projected from this node */ - def metadataOutput: Seq[Attribute] = children.flatMap(_.metadataOutput) + /** + * Metadata fields that can be projected from this node. + * Should be non-empty if the plan propagates its children's output. + */ + def metadataOutput: Seq[Attribute] = Nil /** Returns true if this subtree has data from a streaming data source. */ def isStreaming: Boolean = children.exists(_.isStreaming) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 4bf7f5b85750b..a77701408fceb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.random.RandomSampler /** - * When planning take() or collect() operations, this special node that is inserted at the top of + * When planning take() or collect() operations, this special node is inserted at the top of * the logical plan before invoking the query planner. 
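A self-contained miniature of the new default in plain Scala (not the actual Catalyst classes, and with illustrative names): the base plan no longer forwards its children's metadata output, so every node that should keep hidden columns reachable has to opt in, which is what the per-node overrides in this patch do.

object MetadataOutputDefaultSketch {
  final case class Attr(name: String)

  sealed trait Plan {
    def children: Seq[Plan]
    def output: Seq[Attr]
    // New default: no metadata columns unless a node opts in.
    def metadataOutput: Seq[Attr] = Nil
  }

  final case class Relation(output: Seq[Attr], metadata: Seq[Attr]) extends Plan {
    def children: Seq[Plan] = Nil
    override def metadataOutput: Seq[Attr] = metadata
  }

  // A Filter-like node keeps hidden columns reachable by forwarding the child's metadata output.
  final case class KeepMetadata(child: Plan) extends Plan {
    def children: Seq[Plan] = Seq(child)
    def output: Seq[Attr] = child.output
    override def metadataOutput: Seq[Attr] = child.metadataOutput
  }

  // A Project-like node inherits the default and therefore drops them.
  final case class Select(cols: Seq[Attr], child: Plan) extends Plan {
    def children: Seq[Plan] = Seq(child)
    def output: Seq[Attr] = cols
  }

  def main(args: Array[String]): Unit = {
    val rel = Relation(Seq(Attr("id"), Attr("data")), Seq(Attr("index"), Attr("_partition")))
    assert(KeepMetadata(rel).metadataOutput.map(_.name) == Seq("index", "_partition"))
    assert(Select(Seq(Attr("id")), rel).metadataOutput.isEmpty)
  }
}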
* * Rules can pattern-match on this node in order to apply transformations that only take effect @@ -41,6 +41,7 @@ import org.apache.spark.util.random.RandomSampler case class ReturnAnswer(child: LogicalPlan) extends UnaryNode { override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -52,6 +53,7 @@ case class ReturnAnswer(child: LogicalPlan) extends UnaryNode { */ case class Subquery(child: LogicalPlan, correlated: Boolean) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } object Subquery { @@ -117,10 +119,10 @@ case class Generate( child: LogicalPlan) extends UnaryNode { - lazy val requiredChildOutput: Seq[Attribute] = { - val unrequiredSet = unrequiredChildIndex.toSet + val unrequiredSet: Set[Int] = unrequiredChildIndex.toSet + + lazy val requiredChildOutput: Seq[Attribute] = child.output.zipWithIndex.filterNot(t => unrequiredSet.contains(t._2)).map(_._1) - } override lazy val resolved: Boolean = { generator.resolved && @@ -144,11 +146,14 @@ case class Generate( } def output: Seq[Attribute] = requiredChildOutput ++ qualifiedGeneratorOutput + override def metadataOutput: Seq[Attribute] = + child.metadataOutput.zipWithIndex.filterNot(t => unrequiredSet.contains(t._2)).map(_._1) } case class Filter(condition: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode with PredicateHelper { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: Option[Long] = child.maxRows @@ -197,6 +202,8 @@ case class Intersect( leftAttr.withNullability(leftAttr.nullable && rightAttr.nullable) } + override def metadataOutput: Seq[Attribute] = children.flatMap(_.metadataOutput) + override protected lazy val validConstraints: ExpressionSet = leftConstraints.union(rightConstraints) @@ -216,6 +223,7 @@ case class Except( override def nodeName: String = getClass.getSimpleName + ( if ( isAll ) "All" else "" ) /** We don't use right.output because those rows get excluded from the set. 
*/ override def output: Seq[Attribute] = left.output + override def metadataOutput: Seq[Attribute] = left.metadataOutput override protected lazy val validConstraints: ExpressionSet = leftConstraints } @@ -280,6 +288,8 @@ case class Union( } } + override def metadataOutput: Seq[Attribute] = children.flatMap(_.metadataOutput) + override lazy val resolved: Boolean = { // allChildrenCompatible needs to be evaluated after childrenResolved def allChildrenCompatible: Boolean = @@ -374,6 +384,17 @@ case class Join( } } + override def metadataOutput: Seq[Attribute] = { + joinType match { + case j: ExistenceJoin => + left.metadataOutput + case LeftExistence(_) => + left.metadataOutput + case _ => + children.flatMap(_.metadataOutput) + } + } + override protected lazy val validConstraints: ExpressionSet = { joinType match { case _: InnerLike if condition.isDefined => @@ -476,6 +497,8 @@ case class View( override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.output + override def simpleString(maxFields: Int): String = { s"View (${desc.identifier}, ${output.mkString("[", ",", "]")})" } @@ -530,6 +553,8 @@ object View { case class With(child: LogicalPlan, cteRelations: Seq[(String, SubqueryAlias)]) extends UnaryNode { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput + override def simpleString(maxFields: Int): String = { val cteAliases = truncatedString(cteRelations.map(_._1), "[", ", ", "]", maxFields) s"CTE $cteAliases" @@ -542,6 +567,7 @@ case class WithWindowDefinition( windowDefinitions: Map[String, WindowSpecDefinition], child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -555,6 +581,7 @@ case class Sort( global: Boolean, child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: Option[Long] = child.maxRows override def outputOrdering: Seq[SortOrder] = order } @@ -679,6 +706,7 @@ case class Window( override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output ++ windowExpressions.map(_.toAttribute) + override def metadataOutput: Seq[Attribute] = child.metadataOutput override def producedAttributes: AttributeSet = windowOutputSet @@ -897,6 +925,7 @@ object Limit { */ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: Option[Long] = { limitExpr match { case IntegerLiteral(limit) => Some(limit) @@ -913,6 +942,7 @@ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderP */ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRowsPerPartition: Option[Long] = { limitExpr match { @@ -934,6 +964,7 @@ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPr */ case class Tail(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: 
Option[Long] = { limitExpr match { case IntegerLiteral(limit) => Some(limit) @@ -1019,6 +1050,7 @@ case class Sample( override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -1027,6 +1059,7 @@ case class Sample( case class Distinct(child: LogicalPlan) extends UnaryNode { override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -1037,6 +1070,7 @@ abstract class RepartitionOperation extends UnaryNode { def numPartitions: Int override final def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -1120,6 +1154,7 @@ case class Deduplicate( child: LogicalPlan) extends UnaryNode { override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -1148,4 +1183,5 @@ case class CollectMetrics( } override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala index 4b5e278fccdfb..8130d5ecebc6c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala @@ -31,6 +31,7 @@ case class UnresolvedHint(name: String, parameters: Seq[Any], child: LogicalPlan override lazy val resolved: Boolean = false override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -42,6 +43,8 @@ case class ResolvedHint(child: LogicalPlan, hints: HintInfo = HintInfo()) override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput + override def doCanonicalize(): LogicalPlan = child.canonicalized } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index d383532cbd3d3..569ef3bd882c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -238,6 +238,8 @@ case class TypedFilter( override def output: Seq[Attribute] = child.output + override def metadataOutput: Seq[Attribute] = child.metadataOutput + def withObjectProducerChild(obj: LogicalPlan): Filter = { assert(obj.output.length == 1) Filter(typedCondition(obj.output.head), obj) @@ -333,6 +335,8 @@ case class AppendColumns( override def output: Seq[Attribute] = child.output ++ newColumns + override def metadataOutput: Seq[Attribute] = child.metadataOutput + def newColumns: Seq[Attribute] = serializer.map(_.toAttribute) } @@ -346,6 +350,8 @@ case class AppendColumnsWithObject( child: LogicalPlan) extends ObjectConsumer { override def output: Seq[Attribute] = (childSerializer ++ newColumnsSerializer).map(_.toAttribute) + + override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** Factory for constructing new `MapGroups` nodes. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala index c4f741cd2cec8..4acddb1ff2de3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala @@ -74,6 +74,8 @@ trait BaseEvalPython extends UnaryNode { override def output: Seq[Attribute] = child.output ++ resultAttrs + override def metadataOutput: Seq[Attribute] = child.metadataOutput + override def producedAttributes: AttributeSet = AttributeSet(resultAttrs) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index c4abed32bf624..7ac635c060f70 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2690,6 +2690,73 @@ class DataSourceV2SQLSuite } } + test("SPARK-34923: do not propagate metadata columns through Project") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") + + assertThrows[AnalysisException] { + sql(s"SELECT index, _partition from (SELECT id, data FROM $t1)") + } + assertThrows[AnalysisException] { + spark.table(t1).select("id", "data").select("index", "_partition") + } + } + } + + test("SPARK-34923: propagate metadata columns through Filter") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") + + val sqlQuery = spark.sql(s"SELECT id, data, index, _partition FROM $t1 WHERE id > 1") + val dfQuery = spark.table(t1).where("id > 1").select("id", "data", "index", "_partition") + + Seq(sqlQuery, dfQuery).foreach { query => + checkAnswer(query, Seq(Row(2, "b", 0, "0/2"), Row(3, "c", 0, "1/3"))) + } + } + } + + test("SPARK-34923: propagate metadata columns through Sort") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") + + val sqlQuery = spark.sql(s"SELECT id, data, index, _partition FROM $t1 ORDER BY id") + val dfQuery = spark.table(t1).orderBy("id").select("id", "data", "index", "_partition") + + Seq(sqlQuery, dfQuery).foreach { query => + checkAnswer(query, Seq(Row(1, "a", 0, "3/1"), Row(2, "b", 0, "0/2"), Row(3, "c", 0, "1/3"))) + } + } + } + + test("SPARK-34923: propagate metadata columns through RepartitionBy") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") + + val sqlQuery = spark.sql( + s"SELECT /*+ REPARTITION_BY_RANGE(3, id) */ id, data, index, _partition FROM $t1") + val tbl = spark.table(t1) + val dfQuery = tbl.repartitionByRange(3, tbl.col("id")) + .select("id", "data", "index", "_partition") + + Seq(sqlQuery, 
dfQuery).foreach { query => + checkAnswer(query, Seq(Row(1, "a", 0, "3/1"), Row(2, "b", 0, "0/2"), Row(3, "c", 0, "1/3"))) + } + } + } + private def testNotSupportedV2Command(sqlCommand: String, sqlParams: String): Unit = { val e = intercept[AnalysisException] { sql(s"$sqlCommand $sqlParams") From 7af12ae7907fb59b8f28ea6fdfd48712c27925be Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 31 Mar 2021 15:22:05 -0700 Subject: [PATCH 18/34] Clean up Signed-off-by: Karen Feng --- .../catalyst/plans/logical/basicLogicalOperators.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index a77701408fceb..b0404fd375651 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -121,8 +121,10 @@ case class Generate( val unrequiredSet: Set[Int] = unrequiredChildIndex.toSet - lazy val requiredChildOutput: Seq[Attribute] = + lazy val requiredChildOutput: Seq[Attribute] = { + val unrequiredSet = unrequiredChildIndex.toSet child.output.zipWithIndex.filterNot(t => unrequiredSet.contains(t._2)).map(_._1) + } override lazy val resolved: Boolean = { generator.resolved && @@ -146,8 +148,7 @@ case class Generate( } def output: Seq[Attribute] = requiredChildOutput ++ qualifiedGeneratorOutput - override def metadataOutput: Seq[Attribute] = - child.metadataOutput.zipWithIndex.filterNot(t => unrequiredSet.contains(t._2)).map(_._1) + override def metadataOutput: Seq[Attribute] = child.metadataOutput } case class Filter(condition: Expression, child: LogicalPlan) @@ -386,7 +387,7 @@ case class Join( override def metadataOutput: Seq[Attribute] = { joinType match { - case j: ExistenceJoin => + case ExistenceJoin(_) => left.metadataOutput case LeftExistence(_) => left.metadataOutput From 07f9ad583127c19662384e3f1c31ae29cb444583 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 31 Mar 2021 16:18:42 -0700 Subject: [PATCH 19/34] Always resolve with metadata Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 12 +++++----- .../catalyst/plans/logical/LogicalPlan.scala | 22 +++++-------------- .../plans/logical/basicLogicalOperators.scala | 6 ++--- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index dd87fff6e6f6d..c23b7521fc29a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1744,7 +1744,7 @@ class Analyzer(override val catalogManager: CatalogManager) case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString(conf.maxToStringFields)}") - q.mapExpressions(resolveExpressionByPlanChildren(_, q, withMetadata = true)) + q.mapExpressions(resolveExpressionByPlanChildren(_, q)) } def resolveAssignments( @@ -1981,12 +1981,11 @@ class Analyzer(override val catalogManager: CatalogManager) def resolveExpressionByPlanOutput( expr: Expression, plan: LogicalPlan, - throws: Boolean = false, - withMetadata: Boolean = false): Expression = { + throws: Boolean = false): Expression = { resolveExpression( 
expr, resolveColumnByName = nameParts => { - plan.resolve(nameParts, resolver, withMetadata = withMetadata) + plan.resolve(nameParts, resolver) }, getAttrCandidates = () => plan.output, throws = throws) @@ -2002,12 +2001,11 @@ class Analyzer(override val catalogManager: CatalogManager) */ def resolveExpressionByPlanChildren( e: Expression, - q: LogicalPlan, - withMetadata: Boolean = false): Expression = { + q: LogicalPlan): Expression = { resolveExpression( e, resolveColumnByName = nameParts => { - q.resolveChildren(nameParts, resolver, withMetadata = withMetadata) + q.resolveChildren(nameParts, resolver) }, getAttrCandidates = () => { assert(q.children.length == 1) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 69bd5edac37bb..3ebaf8efb295a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -107,14 +107,9 @@ abstract class LogicalPlan */ def resolveChildren( nameParts: Seq[String], - resolver: Resolver, - withMetadata: Boolean = true): Option[NamedExpression] = { - val maybeResolvedExpression = childAttributes.resolve(nameParts, resolver) - if (withMetadata) { - maybeResolvedExpression.orElse(childMetadataAttributes.resolve(nameParts, resolver)) - } else { - maybeResolvedExpression - } + resolver: Resolver): Option[NamedExpression] = { + childAttributes.resolve(nameParts, resolver) + .orElse(childMetadataAttributes.resolve(nameParts, resolver)) } /** @@ -124,14 +119,9 @@ abstract class LogicalPlan */ def resolve( nameParts: Seq[String], - resolver: Resolver, - withMetadata: Boolean = true): Option[NamedExpression] = { - val maybeResolvedExpression = outputAttributes.resolve(nameParts, resolver) - if (withMetadata) { - maybeResolvedExpression.orElse(outputMetadataAttributes.resolve(nameParts, resolver)) - } else { - maybeResolvedExpression - } + resolver: Resolver): Option[NamedExpression] = { + outputAttributes.resolve(nameParts, resolver) + .orElse(outputMetadataAttributes.resolve(nameParts, resolver)) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index b0404fd375651..398af402d8b49 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -80,10 +80,8 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) override lazy val validConstraints: ExpressionSet = getAllValidConstraints(projectList) - override def metadataOutput: Seq[Attribute] = { - child.metadataOutput ++ - getTagValue(Project.hiddenOutputTag).getOrElse(Seq.empty[Attribute]) - } + override def metadataOutput: Seq[Attribute] = + getTagValue(Project.hiddenOutputTag).getOrElse(Seq.empty[Attribute]) } object Project { From 0f267e797b53202ef25d360d50b2f09624c5dd73 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 6 Apr 2021 14:52:26 -0700 Subject: [PATCH 20/34] Revert accidental metadata changes Signed-off-by: Karen Feng --- .../plans/logical/EventTimeWatermark.scala | 2 -- .../plans/logical/basicLogicalOperators.scala | 19 ------------------- 
.../sql/catalyst/plans/logical/hints.scala | 3 --- .../sql/catalyst/plans/logical/object.scala | 6 ------ .../logical/pythonLogicalOperators.scala | 2 -- 5 files changed, 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala index a591a9e160a8d..b6bf7cd85d472 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala @@ -61,6 +61,4 @@ case class EventTimeWatermark( a } } - - override val metadataOutput: Seq[Attribute] = child.metadataOutput } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 62851844175e9..4643d72ccc992 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -41,7 +41,6 @@ import org.apache.spark.util.random.RandomSampler case class ReturnAnswer(child: LogicalPlan) extends UnaryNode { override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -53,7 +52,6 @@ case class ReturnAnswer(child: LogicalPlan) extends UnaryNode { */ case class Subquery(child: LogicalPlan, correlated: Boolean) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput } object Subquery { @@ -118,8 +116,6 @@ case class Generate( child: LogicalPlan) extends UnaryNode { - val unrequiredSet: Set[Int] = unrequiredChildIndex.toSet - lazy val requiredChildOutput: Seq[Attribute] = { val unrequiredSet = unrequiredChildIndex.toSet child.output.zipWithIndex.filterNot(t => unrequiredSet.contains(t._2)).map(_._1) @@ -147,13 +143,11 @@ case class Generate( } def output: Seq[Attribute] = requiredChildOutput ++ qualifiedGeneratorOutput - override def metadataOutput: Seq[Attribute] = child.metadataOutput } case class Filter(condition: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode with PredicateHelper { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: Option[Long] = child.maxRows @@ -223,7 +217,6 @@ case class Except( override def nodeName: String = getClass.getSimpleName + ( if ( isAll ) "All" else "" ) /** We don't use right.output because those rows get excluded from the set. 
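   * For example, in `SELECT a FROM t1 EXCEPT SELECT a FROM t2` the resulting
   * attribute (name, type, nullability) is taken from t1, not t2.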
*/ override def output: Seq[Attribute] = left.output - override def metadataOutput: Seq[Attribute] = left.metadataOutput override def metadataOutput: Seq[Attribute] = Nil @@ -556,8 +549,6 @@ object View { case class With(child: LogicalPlan, cteRelations: Seq[(String, SubqueryAlias)]) extends UnaryNode { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput - override def simpleString(maxFields: Int): String = { val cteAliases = truncatedString(cteRelations.map(_._1), "[", ", ", "]", maxFields) s"CTE $cteAliases" @@ -570,7 +561,6 @@ case class WithWindowDefinition( windowDefinitions: Map[String, WindowSpecDefinition], child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -584,7 +574,6 @@ case class Sort( global: Boolean, child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: Option[Long] = child.maxRows override def outputOrdering: Seq[SortOrder] = order } @@ -710,7 +699,6 @@ case class Window( override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output ++ windowExpressions.map(_.toAttribute) - override def metadataOutput: Seq[Attribute] = child.metadataOutput override def producedAttributes: AttributeSet = windowOutputSet @@ -906,7 +894,6 @@ object Limit { */ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: Option[Long] = { limitExpr match { case IntegerLiteral(limit) => Some(limit) @@ -923,7 +910,6 @@ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderP */ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRowsPerPartition: Option[Long] = { limitExpr match { @@ -945,7 +931,6 @@ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPr */ case class Tail(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput override def maxRows: Option[Long] = { limitExpr match { case IntegerLiteral(limit) => Some(limit) @@ -1031,7 +1016,6 @@ case class Sample( override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -1040,7 +1024,6 @@ case class Sample( case class Distinct(child: LogicalPlan) extends UnaryNode { override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -1145,7 +1128,6 @@ case class Deduplicate( child: LogicalPlan) extends UnaryNode { override def maxRows: Option[Long] = child.maxRows override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -1174,5 +1156,4 @@ case class CollectMetrics( } override def output: Seq[Attribute] = child.output - override def metadataOutput: 
Seq[Attribute] = child.metadataOutput } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala index 8130d5ecebc6c..4b5e278fccdfb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala @@ -31,7 +31,6 @@ case class UnresolvedHint(name: String, parameters: Seq[Any], child: LogicalPlan override lazy val resolved: Boolean = false override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** @@ -43,8 +42,6 @@ case class ResolvedHint(child: LogicalPlan, hints: HintInfo = HintInfo()) override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput - override def doCanonicalize(): LogicalPlan = child.canonicalized } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala index 569ef3bd882c8..d383532cbd3d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala @@ -238,8 +238,6 @@ case class TypedFilter( override def output: Seq[Attribute] = child.output - override def metadataOutput: Seq[Attribute] = child.metadataOutput - def withObjectProducerChild(obj: LogicalPlan): Filter = { assert(obj.output.length == 1) Filter(typedCondition(obj.output.head), obj) @@ -335,8 +333,6 @@ case class AppendColumns( override def output: Seq[Attribute] = child.output ++ newColumns - override def metadataOutput: Seq[Attribute] = child.metadataOutput - def newColumns: Seq[Attribute] = serializer.map(_.toAttribute) } @@ -350,8 +346,6 @@ case class AppendColumnsWithObject( child: LogicalPlan) extends ObjectConsumer { override def output: Seq[Attribute] = (childSerializer ++ newColumnsSerializer).map(_.toAttribute) - - override def metadataOutput: Seq[Attribute] = child.metadataOutput } /** Factory for constructing new `MapGroups` nodes. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala index 438d05d0a151a..62f2d598b96dc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala @@ -80,8 +80,6 @@ trait BaseEvalPython extends UnaryNode { override def output: Seq[Attribute] = child.output ++ resultAttrs - override def metadataOutput: Seq[Attribute] = child.metadataOutput - override def producedAttributes: AttributeSet = AttributeSet(resultAttrs) } From ed0270c9a9ae6865eb50d028fa95ab3bee9e6b45 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 6 Apr 2021 14:53:15 -0700 Subject: [PATCH 21/34] Revert DatasourceV2SQLSuite Signed-off-by: Karen Feng --- .../sql/connector/DataSourceV2SQLSuite.scala | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 75bdf0cf764a3..e0dcdc794b7c2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -42,7 +42,7 @@ import org.apache.spark.util.Utils class DataSourceV2SQLSuite extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = true) - with AlterTableTests with DatasourceV2SQLBase { + with AlterTableTests with DatasourceV2SQLBase { import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ @@ -128,9 +128,9 @@ class DataSourceV2SQLSuite val descriptionDf = spark.sql("DESCRIBE TABLE EXTENDED testcat.table_name") assert(descriptionDf.schema.map(field => (field.name, field.dataType)) === Seq( - ("col_name", StringType), - ("data_type", StringType), - ("comment", StringType))) + ("col_name", StringType), + ("data_type", StringType), + ("comment", StringType))) assert(descriptionDf.collect() .map(_.toSeq) .map(_.toArray.map(_.toString.trim)) === Array( @@ -480,7 +480,7 @@ class DataSourceV2SQLSuite } assert(!testCatalog.tableExists(Identifier.of(Array(), "table_name")), - "Table should have been dropped as a result of the replace.") + "Table should have been dropped as a result of the replace.") } test("ReplaceTableAsSelect: Non-atomic catalog drops the table permanently if the" + @@ -537,9 +537,9 @@ class DataSourceV2SQLSuite val replaced = testCatalog.loadTable(Identifier.of(Array(), "table_name")) assert(replaced.asInstanceOf[InMemoryTable].rows.isEmpty, - "Replaced table should have no rows after committing.") + "Replaced table should have no rows after committing.") assert(replaced.schema().fields.length === 1, - "Replaced table should have new schema.") + "Replaced table should have new schema.") assert(replaced.schema().fields(0) === StructField("id", LongType, nullable = false), "Replaced table should have new schema.") } @@ -597,8 +597,8 @@ class DataSourceV2SQLSuite assert(table.partitioning.isEmpty) assert(table.properties == withDefaultOwnership(Map("provider" -> v2Source)).asJava) assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) + .add("id", LongType) + .add("data", StringType)) val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) 
checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) @@ -614,8 +614,8 @@ class DataSourceV2SQLSuite assert(table.partitioning.isEmpty) assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) + .add("id", LongType) + .add("data", StringType)) val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) @@ -634,8 +634,8 @@ class DataSourceV2SQLSuite assert(table2.partitioning.isEmpty) assert(table2.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) assert(table2.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) + .add("id", LongType) + .add("data", StringType)) val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), spark.table("source")) @@ -652,8 +652,8 @@ class DataSourceV2SQLSuite assert(table.partitioning.isEmpty) assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) + .add("id", LongType) + .add("data", StringType)) val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) @@ -683,8 +683,8 @@ class DataSourceV2SQLSuite assert(table.partitioning.isEmpty) assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) + .add("id", LongType) + .add("data", StringType)) val rdd = sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) @@ -870,8 +870,8 @@ class DataSourceV2SQLSuite sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") checkAnswer( sql(s""" - |WITH cte AS (SELECT * FROM $t1) - |SELECT * FROM cte + |WITH cte AS (SELECT * FROM $t1) + |SELECT * FROM cte """.stripMargin), spark.table("source")) } @@ -900,9 +900,9 @@ class DataSourceV2SQLSuite val df_joined = df1.join(df2).where(df1("id") + 1 === df2("id")) checkAnswer( sql(s""" - |SELECT * - |FROM $t1 t1, $t2 t2 - |WHERE t1.id + 1 = t2.id + |SELECT * + |FROM $t1 t1, $t2 t2 + |WHERE t1.id + 1 = t2.id """.stripMargin), df_joined) } @@ -1285,8 +1285,8 @@ class DataSourceV2SQLSuite } private def testShowNamespaces( - sqlText: String, - expected: Seq[String]): Unit = { + sqlText: String, + expected: Seq[String]): Unit = { val schema = new StructType().add("namespace", StringType, nullable = false) val df = spark.sql(sqlText) @@ -1349,7 +1349,7 @@ class DataSourceV2SQLSuite } test("SPARK-31100: Use: v2 catalog that implements SupportsNamespaces is used " + - "and namespace not exists") { + "and namespace not exists") { // Namespaces are required to exist for v2 catalogs that implements SupportsNamespaces. val exception = intercept[NoSuchNamespaceException] { sql("USE testcat.ns1.ns2") @@ -1358,7 +1358,7 @@ class DataSourceV2SQLSuite } test("SPARK-31100: Use: v2 catalog that does not implement SupportsNameSpaces is used " + - "and namespace does not exist") { + "and namespace does not exist") { // Namespaces are not required to exist for v2 catalogs // that does not implement SupportsNamespaces. 
withSQLConf("spark.sql.catalog.dummy" -> classOf[BasicInMemoryTableCatalog].getName) { @@ -1735,7 +1735,7 @@ class DataSourceV2SQLSuite val v1Table = "tbl" withTable(v1Table) { sql(s"CREATE TABLE $v1Table" + - s" USING ${classOf[SimpleScanSource].getName} OPTIONS (from=0,to=1)") + s" USING ${classOf[SimpleScanSource].getName} OPTIONS (from=0,to=1)") val exc = intercept[AnalysisException] { sql(s"DELETE FROM $v1Table WHERE i = 2") } @@ -2412,10 +2412,10 @@ class DataSourceV2SQLSuite df.createOrReplaceTempView("source_view") sql(s"CREATE TABLE $t1 (ts timestamp, data string) " + - s"USING $v2Format PARTITIONED BY (days(ts))") + s"USING $v2Format PARTITIONED BY (days(ts))") sql(s"INSERT INTO $t1 VALUES " + - s"(CAST(date_add('2020-01-01', 2) AS timestamp), 'dummy'), " + - s"(CAST(date_add('2020-01-01', 4) AS timestamp), 'keep')") + s"(CAST(date_add('2020-01-01', 2) AS timestamp), 'dummy'), " + + s"(CAST(date_add('2020-01-01', 4) AS timestamp), 'keep')") sql(s"INSERT OVERWRITE TABLE $t1 SELECT ts, data FROM source_view") val expected = spark.createDataFrame(Seq( @@ -2433,7 +2433,7 @@ class DataSourceV2SQLSuite val t1 = s"${catalogAndNamespace}table" withTable(t1) { sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + - "PARTITIONED BY (bucket(4, id), id)") + "PARTITIONED BY (bucket(4, id), id)") sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") val sqlQuery = spark.sql(s"SELECT id, data, index, _partition FROM $t1") @@ -2449,7 +2449,7 @@ class DataSourceV2SQLSuite val t1 = s"${catalogAndNamespace}table" withTable(t1) { sql(s"CREATE TABLE $t1 (index bigint, data string) USING $v2Format " + - "PARTITIONED BY (bucket(4, index), index)") + "PARTITIONED BY (bucket(4, index), index)") sql(s"INSERT INTO $t1 VALUES (3, 'c'), (2, 'b'), (1, 'a')") val sqlQuery = spark.sql(s"SELECT index, data, _partition FROM $t1") @@ -2465,7 +2465,7 @@ class DataSourceV2SQLSuite val t1 = s"${catalogAndNamespace}table" withTable(t1) { sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + - "PARTITIONED BY (bucket(4, id), id)") + "PARTITIONED BY (bucket(4, id), id)") sql(s"INSERT INTO $t1 VALUES (3, 'c'), (2, 'b'), (1, 'a')") val sqlQuery = spark.sql(s"SELECT * FROM $t1") @@ -2527,9 +2527,9 @@ class DataSourceV2SQLSuite val t = "testpart.ns1.ns2.tbl" withTable(t) { sql(s""" - |CREATE TABLE $t (id bigint, city string, data string) - |USING foo - |PARTITIONED BY (id, city)""".stripMargin) + |CREATE TABLE $t (id bigint, city string, data string) + |USING foo + |PARTITIONED BY (id, city)""".stripMargin) val partTable = catalog("testpart").asTableCatalog .loadTable(Identifier.of(Array("ns1", "ns2"), "tbl")).asInstanceOf[InMemoryPartitionTable] val expectedPartitionIdent = InternalRow.fromSeq(Seq(1, UTF8String.fromString("NY"))) @@ -2544,10 +2544,10 @@ class DataSourceV2SQLSuite test("View commands are not supported in v2 catalogs") { def validateViewCommand( - sql: String, - catalogName: String, - viewName: String, - cmdName: String): Unit = { + sql: String, + catalogName: String, + viewName: String, + cmdName: String): Unit = { assertAnalysisError( sql, s"Cannot specify catalog `$catalogName` for view $viewName because view support " + @@ -2576,9 +2576,9 @@ class DataSourceV2SQLSuite val t = "testpart.ns1.ns2.tbl" withTable(t) { sql(s""" - |CREATE TABLE $t (id bigint, city string, data string) - |USING foo - |PARTITIONED BY (id, city)""".stripMargin) + |CREATE TABLE $t (id bigint, city string, data string) + |USING foo + |PARTITIONED BY (id, city)""".stripMargin) val partTable = 
catalog("testpart").asTableCatalog .loadTable(Identifier.of(Array("ns1", "ns2"), "tbl")).asInstanceOf[InMemoryPartitionTable] From 66ad572bee29e606d2c9e85db28c5a35f1d2d022 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 6 Apr 2021 14:54:45 -0700 Subject: [PATCH 22/34] Revert unneeded style changes Signed-off-by: Karen Feng --- .../spark/sql/catalyst/plans/logical/LogicalPlan.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 2527dd1d60881..cef0e06c357f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -108,10 +108,9 @@ abstract class LogicalPlan */ def resolveChildren( nameParts: Seq[String], - resolver: Resolver): Option[NamedExpression] = { + resolver: Resolver): Option[NamedExpression] = childAttributes.resolve(nameParts, resolver) .orElse(childMetadataAttributes.resolve(nameParts, resolver)) - } /** * Optionally resolves the given strings to a [[NamedExpression]] based on the output of this @@ -120,10 +119,9 @@ abstract class LogicalPlan */ def resolve( nameParts: Seq[String], - resolver: Resolver): Option[NamedExpression] = { + resolver: Resolver): Option[NamedExpression] = outputAttributes.resolve(nameParts, resolver) .orElse(outputMetadataAttributes.resolve(nameParts, resolver)) - } /** * Given an attribute name, split it to name parts by dot, but From fc3b16df5291151cfd5c7a3f5407222868148317 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 6 Apr 2021 16:11:02 -0700 Subject: [PATCH 23/34] Revert accidental metadata output change Signed-off-by: Karen Feng --- .../spark/sql/catalyst/plans/logical/basicLogicalOperators.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 4643d72ccc992..71ce2f553f098 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -62,7 +62,6 @@ object Subquery { case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extends OrderPreservingUnaryNode { override def output: Seq[Attribute] = projectList.map(_.toAttribute) - override def metadataOutput: Seq[Attribute] = Nil override def maxRows: Option[Long] = child.maxRows override lazy val resolved: Boolean = { From f665030c32af34b2d865d1c40e0da1bdc302741c Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 6 Apr 2021 19:20:26 -0700 Subject: [PATCH 24/34] Add childrens' hidden output to Project metadataOutput Signed-off-by: Karen Feng --- .../sql/catalyst/plans/logical/basicLogicalOperators.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 71ce2f553f098..767f73163ca81 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -79,7 +79,8 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) getAllValidConstraints(projectList) override def metadataOutput: Seq[Attribute] = - getTagValue(Project.hiddenOutputTag).getOrElse(Seq.empty[Attribute]) + child.metadataOutput.filter(_.isHiddenCol) ++ + getTagValue(Project.hiddenOutputTag).getOrElse(Seq.empty[Attribute]) } object Project { From 85b81b1558188ba86fb3a7442da34addbb44aa9c Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 6 Apr 2021 19:55:45 -0700 Subject: [PATCH 25/34] Retrigger tests Signed-off-by: Karen Feng From eab7964456ad3d3178a550a072268bf98e08c5a3 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 7 Apr 2021 10:23:57 -0700 Subject: [PATCH 26/34] Add comments Signed-off-by: Karen Feng --- .../scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 2 ++ .../main/scala/org/apache/spark/sql/catalyst/util/package.scala | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 33aad7acd7f6d..dfe62b5c92a79 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -961,6 +961,8 @@ class Analyzer(override val catalogManager: CatalogManager) lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) val hasMetaCol = plan.expressions.exists(_.find { case a: Attribute => + // If an attribute is resolved before being labeled as metadata + // (i.e. from the originating Dataset), we check with expression ID a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) case _ => false }.isDefined) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index b01e4407c9dc0..6c109754fe5af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -205,7 +205,7 @@ package object util extends Logging { val METADATA_COL_ATTR_KEY = "__metadata_col" /** - * Hidden columns are a type of metadata column that are not propagated through subquery aliases, + * Hidden columns are a type of metadata column that are propagated through Project * and are candidates during qualified star expansions. 
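 * For example, given tables nt1(k, v1) and nt2(k, v2), the visible output of
 * `nt1 NATURAL JOIN nt2` is (k, v1, v2), but the qualified star in `SELECT nt2.*`
 * can still expand to nt2's own k and v2, because nt2's deduplicated key is kept
 * as a hidden column.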
*/ val HIDDEN_COL_ATTR_KEY = "__hidden_col" From c84f396e3427ffceb2891e2b1dc4b30ddc2239d0 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 7 Apr 2021 13:22:21 -0700 Subject: [PATCH 27/34] Address comments and fix propagation through Projects Signed-off-by: Karen Feng --- .../plans/logical/basicLogicalOperators.scala | 11 +++++-- .../spark/sql/catalyst/util/package.scala | 4 +-- .../sql-tests/inputs/natural-join.sql | 8 +++++ .../sql-tests/results/natural-join.sql.out | 30 ++++++++++++++++++- 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 0a6486f557905..e0079e2724ccb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -79,9 +79,14 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) override lazy val validConstraints: ExpressionSet = getAllValidConstraints(projectList) - override def metadataOutput: Seq[Attribute] = - child.metadataOutput.filter(_.isHiddenCol) ++ - getTagValue(Project.hiddenOutputTag).getOrElse(Seq.empty[Attribute]) + override def metadataOutput: Seq[Attribute] = { + val hiddenOutput = getTagValue(Project.hiddenOutputTag) + if (hiddenOutput.isDefined) { + child.metadataOutput.filter(_.isHiddenCol) ++ hiddenOutput.get + } else { + Nil + } + } } object Project { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index 6c109754fe5af..86d7248070466 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -205,8 +205,8 @@ package object util extends Logging { val METADATA_COL_ATTR_KEY = "__metadata_col" /** - * Hidden columns are a type of metadata column that are propagated through Project - * and are candidates during qualified star expansions. + * Hidden columns are a type of metadata column that are candidates during qualified star + * star expansions. They are propagated through Projects that have hidden output. 
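+ * For example, `nt1 NATURAL JOIN nt2 NATURAL JOIN nt3 NATURAL JOIN nt4` places a Project
+ * on top of each join; keeping the hidden join keys in those Projects' metadataOutput is
+ * what lets `SELECT nt1.*, nt2.*, nt3.*, nt4.*` still expand every table's own key at the
+ * outermost level.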
*/ val HIDDEN_COL_ATTR_KEY = "__hidden_col" diff --git a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql index dad70c00257e6..33a33abd2f758 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql @@ -16,6 +16,12 @@ create temporary view nt3 as select * from values ("one", 6) as nt3(k, v3); +create temporary view nt4 as select * from values + ("one", 7), + ("two", 8), + ("one", 9) + as nt4(k, v4); + SELECT * FROM nt1 natural join nt2; SELECT * FROM nt1 natural join nt2 where k = "one"; @@ -61,3 +67,5 @@ SELECT nt1.*, nt2.*, nt3.* FROM nt1 natural join nt2 join nt3 on nt2.k = nt3.k; SELECT * FROM nt1 natural join nt2 join nt3 on nt1.k = nt3.k; SELECT * FROM nt1 natural join nt2 join nt3 on nt2.k = nt3.k; + +SELECT nt1.*, nt2.*, nt3.*, nt4.* FROM nt1 natural join nt2 natural join nt3 natural join nt4; diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index 81fb50ec68093..39e575a7e35e5 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 26 +-- Number of queries: 28 -- !query @@ -38,6 +38,18 @@ struct<> +-- !query +create temporary view nt4 as select * from values + ("one", 7), + ("two", 8), + ("one", 9) + as nt4(k, v4) +-- !query schema +struct<> +-- !query output + + + -- !query SELECT * FROM nt1 natural join nt2 -- !query schema @@ -271,3 +283,19 @@ one 1 1 one 6 one 1 5 one 4 one 1 5 one 6 two 2 22 two 5 + + +-- !query +SELECT nt1.*, nt2.*, nt3.*, nt4.* FROM nt1 natural join nt2 natural join nt3 natural join nt4 +-- !query schema +struct +-- !query output +one 1 one 1 one 4 one 7 +one 1 one 1 one 4 one 9 +one 1 one 1 one 6 one 7 +one 1 one 1 one 6 one 9 +one 1 one 5 one 4 one 7 +one 1 one 5 one 4 one 9 +one 1 one 5 one 6 one 7 +one 1 one 5 one 6 one 9 +two 2 two 22 two 5 two 8 From 0fe04a2e014d90da13d5f2a8d4a456836b1ba6ad Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 7 Apr 2021 14:18:04 -0700 Subject: [PATCH 28/34] Retrigger tests From c7c3df63592ad31c61e5a4c684a8d9aa3906f5e5 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 7 Apr 2021 15:16:11 -0700 Subject: [PATCH 29/34] Retrigger tests From b1bf28d9ca413cf2dee2957e5d6d8eeb4c9a4f6e Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 7 Apr 2021 21:39:31 -0700 Subject: [PATCH 30/34] Retrigger tests From 47be66dc8c9408e666493b55c6a240fcb9036e85 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Fri, 9 Apr 2021 09:44:25 -0700 Subject: [PATCH 31/34] Wording Signed-off-by: Karen Feng --- .../scala/org/apache/spark/sql/catalyst/util/package.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index 86d7248070466..b9560d6c8f9cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -206,7 +206,8 @@ package object util extends Logging { /** * Hidden columns are a type of metadata column that are candidates during qualified star - * star expansions. 
They are propagated through Projects that have hidden output. + * star expansions. They are propagated through Projects that have hidden children output, + * so that nested hidden output is not lost. */ val HIDDEN_COL_ATTR_KEY = "__hidden_col" From 333a81550ea6e20b1243fb642529e1fceb9853e6 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 12 Apr 2021 10:39:10 -0700 Subject: [PATCH 32/34] propagate hidden columns from nested NATURAL/USING JOINs Signed-off-by: Karen Feng --- .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 5 ++++- .../catalyst/plans/logical/basicLogicalOperators.scala | 10 ++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 89b2f8506f82b..312e64288793e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -3199,8 +3199,11 @@ class Analyzer(override val catalogManager: CatalogManager) sys.error("Unsupported natural join type " + joinType) } // use Project to hide duplicated common keys + // propagate hidden columns from nested USING/NATURAL JOINs val project = Project(projectList, Join(left, right, joinType, newCondition, hint)) - project.setTagValue(Project.hiddenOutputTag, hiddenList.map(_.asHiddenCol())) + project.setTagValue( + Project.hiddenOutputTag, + hiddenList.map(_.asHiddenCol()) ++ project.child.metadataOutput.filter(_.isHiddenCol)) project } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 9d973ee334562..d96b094c38f6a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -83,14 +83,8 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) override lazy val validConstraints: ExpressionSet = getAllValidConstraints(projectList) - override def metadataOutput: Seq[Attribute] = { - val hiddenOutput = getTagValue(Project.hiddenOutputTag) - if (hiddenOutput.isDefined) { - child.metadataOutput.filter(_.isHiddenCol) ++ hiddenOutput.get - } else { - Nil - } - } + override def metadataOutput: Seq[Attribute] = + getTagValue(Project.hiddenOutputTag).getOrElse(Nil) override protected def withNewChildInternal(newChild: LogicalPlan): Project = copy(child = newChild) From 9e62d7d3aa90906d5c8883c73dd71f81e319d97f Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Mon, 12 Apr 2021 16:29:17 -0700 Subject: [PATCH 33/34] Retrigger tests Signed-off-by: Karen Feng From 446d4bc5a65160ae8d8934afd6abd0d5df413374 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 13 Apr 2021 10:53:42 -0700 Subject: [PATCH 34/34] address comments Signed-off-by: Karen Feng --- .../sql/catalyst/analysis/Analyzer.scala | 19 +++++++++------- .../sql/catalyst/analysis/unresolved.scala | 2 +- .../plans/logical/basicLogicalOperators.scala | 2 +- .../spark/sql/catalyst/util/package.scala | 22 +++++++++---------- .../sql-tests/inputs/natural-join.sql | 2 ++ .../sql-tests/results/natural-join.sql.out | 12 +++++++++- 6 files changed, 36 insertions(+), 23 deletions(-) diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index a3ef54ecb58aa..80cf2865bd11f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -948,24 +948,26 @@ class Analyzer(override val catalogManager: CatalogManager) } private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) plan.expressions.flatMap(_.collect { case a: Attribute if a.isMetadataCol => a - case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => - childMetadataOutput.find(_.exprId == a.exprId).get + case a: Attribute + if plan.children.exists(c => c.metadataOutput.exists(_.exprId == a.exprId)) => + plan.children.collectFirst { + case c if c.metadataOutput.exists(_.exprId == a.exprId) => + c.metadataOutput.find(_.exprId == a.exprId).get + }.get }) } private def hasMetadataCol(plan: LogicalPlan): Boolean = { - lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) - val hasMetaCol = plan.expressions.exists(_.find { + plan.expressions.exists(_.find { case a: Attribute => // If an attribute is resolved before being labeled as metadata // (i.e. from the originating Dataset), we check with expression ID - a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) + a.isMetadataCol || + plan.children.exists(c => c.metadataOutput.exists(_.exprId == a.exprId)) case _ => false }.isDefined) - hasMetaCol } private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { @@ -3209,7 +3211,8 @@ class Analyzer(override val catalogManager: CatalogManager) val project = Project(projectList, Join(left, right, joinType, newCondition, hint)) project.setTagValue( Project.hiddenOutputTag, - hiddenList.map(_.asHiddenCol()) ++ project.child.metadataOutput.filter(_.isHiddenCol)) + hiddenList.map(_.markAsSupportsQualifiedStar()) ++ + project.child.metadataOutput.filter(_.supportsQualifiedStar)) project } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 2e3866aa18fe9..5001e2ea88ac7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -370,7 +370,7 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu if (target.isEmpty) return input.output // If there is a table specified, use hidden input attributes as well - val hiddenOutput = input.metadataOutput.filter(_.isHiddenCol) + val hiddenOutput = input.metadataOutput.filter(_.supportsQualifiedStar) val expandedAttributes = (hiddenOutput ++ input.output).filter( matchedQualifier(_, target.get, resolver)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index f4e4a7a4cd2a9..14b6b7b0d5c1a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -1084,7 +1084,7 @@ case class SubqueryAlias( override def 
metadataOutput: Seq[Attribute] = { val qualifierList = identifier.qualifier :+ alias - child.metadataOutput.filterNot(_.isHiddenCol).map(_.withQualifier(qualifierList)) + child.metadataOutput.map(_.withQualifier(qualifierList)) } override def doCanonicalize(): LogicalPlan = child.canonicalized diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala index b9560d6c8f9cc..33fe48d44dadb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala @@ -204,26 +204,24 @@ package object util extends Logging { val METADATA_COL_ATTR_KEY = "__metadata_col" - /** - * Hidden columns are a type of metadata column that are candidates during qualified star - * star expansions. They are propagated through Projects that have hidden children output, - * so that nested hidden output is not lost. - */ - val HIDDEN_COL_ATTR_KEY = "__hidden_col" + implicit class MetadataColumnHelper(attr: Attribute) { + /** + * If set, this metadata column is a candidate during qualified star expansions. + */ + val SUPPORTS_QUALIFIED_STAR = "__supports_qualified_star" - implicit class SpecialColumnHelper(attr: Attribute) { def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && attr.metadata.getBoolean(METADATA_COL_ATTR_KEY) - def isHiddenCol: Boolean = attr.isMetadataCol && - attr.metadata.contains(HIDDEN_COL_ATTR_KEY) && - attr.metadata.getBoolean(HIDDEN_COL_ATTR_KEY) + def supportsQualifiedStar: Boolean = attr.isMetadataCol && + attr.metadata.contains(SUPPORTS_QUALIFIED_STAR) && + attr.metadata.getBoolean(SUPPORTS_QUALIFIED_STAR) - def asHiddenCol(): Attribute = attr.withMetadata( + def markAsSupportsQualifiedStar(): Attribute = attr.withMetadata( new MetadataBuilder() .withMetadata(attr.metadata) .putBoolean(METADATA_COL_ATTR_KEY, true) - .putBoolean(HIDDEN_COL_ATTR_KEY, true) + .putBoolean(SUPPORTS_QUALIFIED_STAR, true) .build() ) } diff --git a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql index 33a33abd2f758..060f15e3d2e87 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql @@ -42,6 +42,8 @@ SELECT nt2.* FROM nt1 natural join nt2; SELECT sbq.* from (SELECT * FROM nt1 natural join nt2) sbq; +SELECT sbq.k from (SELECT * FROM nt1 natural join nt2) sbq; + SELECT nt1.*, nt2.* FROM nt1 natural join nt2; SELECT *, nt2.k FROM nt1 natural join nt2; diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index 39e575a7e35e5..794e4725d9ea4 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 28 +-- Number of queries: 29 -- !query @@ -147,6 +147,16 @@ one 1 5 two 2 22 +-- !query +SELECT sbq.k from (SELECT * FROM nt1 natural join nt2) sbq +-- !query schema +struct +-- !query output +one +one +two + + -- !query SELECT nt1.*, nt2.* FROM nt1 natural join nt2 -- !query schema