[SPARK-34527][SQL] Resolve duplicated common columns from USING/NATURAL JOIN #31666

Closed (wants to merge 44 commits)
Commits
1c5ab03
Only use metadata columns for resolution as last resort
karenfeng Feb 26, 2021
2fe733f
Resolve deduplicated common columns in NATURAL/USING JOIN with hidden…
karenfeng Feb 26, 2021
2c261bb
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Feb 26, 2021
80beda8
Fix behavior in Scala
karenfeng Feb 27, 2021
e1719d3
Fix nested expression
karenfeng Feb 27, 2021
6fa70ba
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Mar 2, 2021
0ba1916
Address comments
karenfeng Mar 4, 2021
2b7e730
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Mar 4, 2021
0c116a5
Push SQL output
karenfeng Mar 4, 2021
bf87f55
Re-do changes
karenfeng Mar 7, 2021
b5dc44f
Fixup tag
karenfeng Mar 8, 2021
6e32b3d
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Mar 8, 2021
7c3f5df
Add another failing test
karenfeng Mar 8, 2021
181751a
Merge add metadata and resolve missing references
karenfeng Mar 29, 2021
ad5e824
Resolve down to avoid prematurely projecting out
karenfeng Mar 29, 2021
e36e853
Remove printlns
karenfeng Mar 29, 2021
db44c53
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Mar 29, 2021
73b7c8a
Clean up
karenfeng Mar 29, 2021
1eb01e2
Style fixup
karenfeng Mar 29, 2021
9fd2490
Formatting fix
karenfeng Mar 29, 2021
f5cc3ae
Address comments
karenfeng Mar 30, 2021
fa7207e
Metadata output should be empty by default
karenfeng Mar 31, 2021
7af12ae
Clean up
karenfeng Mar 31, 2021
07f9ad5
Always resolve with metadata
karenfeng Mar 31, 2021
c474745
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Apr 6, 2021
0f267e7
Revert accidental metadata changes
karenfeng Apr 6, 2021
ed0270c
Revert DatasourceV2SQLSuite
karenfeng Apr 6, 2021
66ad572
Revert unneeded style changes
karenfeng Apr 6, 2021
fc3b16d
Revert accidental metadata output change
karenfeng Apr 6, 2021
f665030
Add childrens' hidden output to Project metadataOutput
karenfeng Apr 7, 2021
85b81b1
Retrigger tests
karenfeng Apr 7, 2021
eab7964
Add comments
karenfeng Apr 7, 2021
44ee9f8
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Apr 7, 2021
c84f396
Address comments and fix propagation through Projects
karenfeng Apr 7, 2021
0fe04a2
Retrigger tests
karenfeng Apr 7, 2021
c7c3df6
Retrigger tests
karenfeng Apr 7, 2021
b1bf28d
Retrigger tests
karenfeng Apr 8, 2021
47be66d
Wording
karenfeng Apr 9, 2021
8c5144e
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Apr 9, 2021
333a815
propagate hidden columns from nested NATURAL/USING JOINs
karenfeng Apr 12, 2021
49de5c5
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Apr 12, 2021
9e62d7d
Retrigger tests
karenfeng Apr 12, 2021
446d4bc
address comments
karenfeng Apr 13, 2021
8f70c2d
Merge branch 'master' of https://github.com/apache/spark into spark-3…
karenfeng Apr 13, 2021
@@ -979,7 +979,7 @@ class Analyzer(override val catalogManager: CatalogManager)
*
* References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]],
* but the relation's output does not include the metadata columns until the relation is replaced
* using [[DataSourceV2Relation.withMetadataColumns()]]. Unless this rule adds metadata to the
* with a copy adding them to the output. Unless this rule adds metadata to the relation's output,
* relation's output, the analyzer will detect that nothing produces the columns.
*
* This rule only adds metadata columns when a node is resolved but is missing input from its
@@ -988,31 +988,43 @@ class Analyzer(override val catalogManager: CatalogManager)
* columns are not accidentally selected by *.
*/
object AddMetadataColumns extends Rule[LogicalPlan] {
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
import org.apache.spark.sql.catalyst.util._

private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = {
lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput)
Contributor:
nit: we can avoid building a new Seq frequently. The check can be
plan.children.exists(c => c.metadataOutput.exists(_.exprId == a.exprId))

Contributor:
The same to hasMetadataCol
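
Applied to hasMetadataCol, the reviewer's suggestion might look roughly like this (a sketch of the idea, not the committed code):

private def hasMetadataCol(plan: LogicalPlan): Boolean = {
  plan.expressions.exists(_.find {
    case a: Attribute =>
      // Check the metadata flag first; only then probe each child's metadataOutput,
      // so no combined Seq is materialized per attribute.
      a.isMetadataCol ||
        plan.children.exists(c => c.metadataOutput.exists(_.exprId == a.exprId))
    case _ => false
  }.isDefined)
}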

plan.expressions.flatMap(_.collect {
case a: Attribute if a.isMetadataCol => a
case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) =>
Contributor Author:
This occurs in the case that a column is resolved below the level at which it becomes labeled as metadata. For the NATURAL/USING JOIN, this occurs when the column is resolved at the level of the root table - it is only labeled as hidden when it is used as a key column in the join.
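
A concrete case is the DataFrame API, with hypothetical DataFrames df1 and df2 that both have a key column (the snippet is illustrative, not taken from this change):

val joined = df1.join(df2, Seq("key"), "full_outer").select(df1("key"))

Here df1("key") binds to the attribute of the base relation before anything marks it as hidden; the hidden flag only appears on the copy kept in the join Project's metadataOutput, so matching on exprId against the children's metadataOutput is what lets the rule recognize the reference.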

childMetadataOutput.find(_.exprId == a.exprId).get
})
}

private def hasMetadataCol(plan: LogicalPlan): Boolean = {
lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput)
plan.expressions.exists(_.find {
case a: Attribute => a.isMetadataCol
case a: Attribute =>
a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId)
case _ => false
}.isDefined)
}

private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match {
case r: DataSourceV2Relation => r.withMetadataColumns()
case p: Project => p.copy(
projectList = p.metadataOutput ++ p.projectList,
child = addMetadataCol(p.child))
case _ => plan.withNewChildren(plan.children.map(addMetadataCol))
}

def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) =>
val inputAttrs = AttributeSet(node.children.flatMap(_.output))
val metaCols = node.expressions.flatMap(_.collect {
case a: Attribute if a.isMetadataCol && !inputAttrs.contains(a) => a
})
val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains)
if (metaCols.isEmpty) {
node
} else {
val newNode = addMetadataCol(node)
// We should not change the output schema of the plan. We should project away the extr
// We should not change the output schema of the plan. We should project away the extra
// metadata columns if necessary.
if (newNode.sameOutput(node)) {
newNode
@@ -3283,6 +3295,59 @@ class Analyzer(override val catalogManager: CatalogManager)
* Then apply a Project on a normal Join to eliminate natural or using join.
*/
object ResolveNaturalAndUsingJoin extends Rule[LogicalPlan] {
private def commonNaturalJoinProcessing(
left: LogicalPlan,
right: LogicalPlan,
joinType: JoinType,
joinNames: Seq[String],
condition: Option[Expression],
hint: JoinHint): LogicalPlan = {
import org.apache.spark.sql.catalyst.util._

val leftKeys = joinNames.map { keyName =>
left.output.find(attr => resolver(attr.name, keyName)).getOrElse {
throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left")
}
}
val rightKeys = joinNames.map { keyName =>
right.output.find(attr => resolver(attr.name, keyName)).getOrElse {
throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right")
}
}
val joinPairs = leftKeys.zip(rightKeys)

val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And)

// columns not in joinPairs
val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att))
val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att))

// the output list looks like: join keys, columns from left, columns from right
val (projectList, hiddenList) = joinType match {
case LeftOuter =>
(leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true)), rightKeys)
case LeftExistence(_) =>
(leftKeys ++ lUniqueOutput, Seq.empty)
case RightOuter =>
(rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput, leftKeys)
case FullOuter =>
// in full outer join, joinCols should be non-null if there is.
val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() }
(joinedCols ++
lUniqueOutput.map(_.withNullability(true)) ++
rUniqueOutput.map(_.withNullability(true)),
leftKeys ++ rightKeys)
case _ : InnerLike =>
(leftKeys ++ lUniqueOutput ++ rUniqueOutput, rightKeys)
case _ =>
sys.error("Unsupported natural join type " + joinType)
}
// use Project to hide duplicated common keys
val project = Project(projectList, Join(left, right, joinType, newCondition, hint))
project.setTagValue(project.hiddenOutputTag, hiddenList.map(_.asHiddenCol()))
project
}
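
As a worked example (hypothetical tables t1(key, a) and t2(key, b), not part of this change): for SELECT * FROM t1 JOIN t2 USING (key), the inner-join branch yields projectList = [key, a, b] with the left key kept visible, and hiddenList = [t2.key]; the rewritten plan is then roughly Project([key, a, b], Join(t1, t2, Inner, Some(t1.key = t2.key), hint)), with t2.key recorded in hiddenOutputTag as a hidden column.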

override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp {
case j @ Join(left, right, UsingJoin(joinType, usingCols), _, hint)
if left.resolved && right.resolved && j.duplicateResolved =>
@@ -3370,54 +3435,6 @@ class Analyzer(override val catalogManager: CatalogManager)
}
}

private def commonNaturalJoinProcessing(
Contributor:
why do we move this method? It creates a lot of code diff and makes it harder to review.

Contributor Author:
I can move it back - I just wasn't sure why it lived outside of this class, given that it's not shared.

left: LogicalPlan,
right: LogicalPlan,
joinType: JoinType,
joinNames: Seq[String],
condition: Option[Expression],
hint: JoinHint) = {
val leftKeys = joinNames.map { keyName =>
left.output.find(attr => resolver(attr.name, keyName)).getOrElse {
throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left")
}
}
val rightKeys = joinNames.map { keyName =>
right.output.find(attr => resolver(attr.name, keyName)).getOrElse {
throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right")
}
}
val joinPairs = leftKeys.zip(rightKeys)

val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And)

// columns not in joinPairs
val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att))
val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att))

// the output list looks like: join keys, columns from left, columns from right
val projectList = joinType match {
case LeftOuter =>
leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true))
case LeftExistence(_) =>
leftKeys ++ lUniqueOutput
case RightOuter =>
rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput
case FullOuter =>
// in full outer join, joinCols should be non-null if there is.
val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() }
joinedCols ++
lUniqueOutput.map(_.withNullability(true)) ++
rUniqueOutput.map(_.withNullability(true))
case _ : InnerLike =>
leftKeys ++ lUniqueOutput ++ rUniqueOutput
case _ =>
sys.error("Unsupported natural join type " + joinType)
}
// use Project to trim unnecessary fields
Project(projectList, Join(left, right, joinType, newCondition, hint))
}

/**
* Replaces [[UnresolvedDeserializer]] with the deserialization expression that has been resolved
* to the given input attributes.
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.parser.ParserUtils
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.util.quoteIdentifier
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog}
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
import org.apache.spark.sql.types.{DataType, Metadata, StructType}
@@ -316,11 +316,11 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu
* Returns true if the nameParts is a subset of the last elements of qualifier of the attribute.
*
* For example, the following should all return true:
* - `SELECT ns1.ns2.t.* FROM ns1.n2.t` where nameParts is Seq("ns1", "ns2", "t") and
* - `SELECT ns1.ns2.t.* FROM ns1.ns2.t` where nameParts is Seq("ns1", "ns2", "t") and
* qualifier is Seq("ns1", "ns2", "t").
* - `SELECT ns2.t.* FROM ns1.n2.t` where nameParts is Seq("ns2", "t") and
* - `SELECT ns2.t.* FROM ns1.ns2.t` where nameParts is Seq("ns2", "t") and
* qualifier is Seq("ns1", "ns2", "t").
* - `SELECT t.* FROM ns1.n2.t` where nameParts is Seq("t") and
* - `SELECT t.* FROM ns1.ns2.t` where nameParts is Seq("t") and
* qualifier is Seq("ns1", "ns2", "t").
*/
private def matchedQualifier(
@@ -342,10 +342,13 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu
override def expand(
input: LogicalPlan,
resolver: Resolver): Seq[NamedExpression] = {
// If there is no table specified, use all input attributes.
// If there is no table specified, use all non-hidden input attributes.
if (target.isEmpty) return input.output

val expandedAttributes = input.output.filter(matchedQualifier(_, target.get, resolver))
// If there is a table specified, use hidden input attributes as well
val hiddenOutput = input.metadataOutput.filter(_.isHiddenCol)
val expandedAttributes = (hiddenOutput ++ input.output).filter(
matchedQualifier(_, target.get, resolver))

if (expandedAttributes.nonEmpty) return expandedAttributes
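
An illustrative query (hypothetical tables, not from this change's tests): with hidden columns now eligible for qualified star expansion,

spark.sql("SELECT t2.* FROM t1 JOIN t2 USING (key)")

expands t2.* to include the de-duplicated join key from t2, while an unqualified * still sees only the regular output.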

@@ -85,7 +85,7 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan =>
if (!analyzed) {
AnalysisHelper.allowInvokingTransformsInAnalyzer {
val afterRuleOnChildren = mapChildren(_.resolveOperatorsUp(rule))
if (self fastEquals afterRuleOnChildren) {
val newNode = if (self fastEquals afterRuleOnChildren) {
CurrentOrigin.withOrigin(origin) {
rule.applyOrElse(self, identity[LogicalPlan])
}
@@ -94,6 +94,8 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan =>
rule.applyOrElse(afterRuleOnChildren, identity[LogicalPlan])
}
}
newNode.copyTagsFrom(this)
Contributor Author:
This exists in transformUp, but not in resolveOperatorsUp - was the difference intentional or unintentional? Without the tags, the metadata cannot be resolved properly (isMetadataCol is always false).

Contributor:
I think it's a mistake.
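
A minimal sketch of the failure mode being addressed (the rule body and the plan are hypothetical):

// Any rule that returns a rebuilt node yields a fresh TreeNode; before this change,
// per-node tags such as Project.hiddenOutputTag were dropped in the process.
val resolved = plan.resolveOperatorsUp {
  case Project(projectList, child) => Project(projectList, child) // new node, same shape
}
// With copyTagsFrom, the rebuilt node keeps the tags of the node it replaces.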

newNode
}
} else {
self
@@ -25,7 +25,8 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning}
import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.catalyst.trees.TreeNodeTag
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.util.random.RandomSampler
@@ -76,6 +77,13 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan)

override lazy val validConstraints: ExpressionSet =
getAllValidConstraints(projectList)

val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput")

override def metadataOutput: Seq[Attribute] = {
child.metadataOutput ++
getTagValue(hiddenOutputTag).getOrElse(Seq.empty[Attribute])
Contributor:
It's unfortunate that we need to use TreeNodeTag to store the extra information in Project, but I don't have a better idea without changing the Project constructor.

Contributor Author:
We could make this more generic by adding this LogicalPlan's metadataOutput, but that would complicate how we can add these hidden columns in AddMetadataColumns.

}
}

/**
@@ -950,7 +958,7 @@ case class SubqueryAlias(

override def metadataOutput: Seq[Attribute] = {
val qualifierList = identifier.qualifier :+ alias
child.metadataOutput.map(_.withQualifier(qualifierList))
child.metadataOutput.filterNot(_.isHiddenCol).map(_.withQualifier(qualifierList))
}

override def doCanonicalize(): LogicalPlan = child.canonicalized
@@ -25,7 +25,7 @@ import java.util.concurrent.atomic.AtomicBoolean
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{NumericType, StringType}
import org.apache.spark.sql.types.{MetadataBuilder, NumericType, StringType}
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.Utils

@@ -193,4 +193,29 @@ package object util extends Logging {
def truncatedString[T](seq: Seq[T], sep: String, maxFields: Int): String = {
truncatedString(seq, "", sep, "", maxFields)
}

val METADATA_COL_ATTR_KEY = "__metadata_col"
implicit class MetadataColumnHelper(attr: Attribute) {
def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) &&
attr.metadata.getBoolean(METADATA_COL_ATTR_KEY)
}

/**
* Hidden columns are a type of metadata column that are not propagated through subquery aliases,
* and are candidates during qualified star expansions.
*/
val HIDDEN_COL_ATTR_KEY = "__hidden_col"
Contributor:
The semantics are clear now, let's refine the naming.

We only have metadata columns, and a metadata column can be included in a qualified star if required. We can just add a new property to metadata columns to indicate this.

The property name can be __support_qualified_star, and the helper class can be

implicit class MetadataColumnHelper(attr: Attribute) {
  def isMetadataCol: Boolean ...
  def supportQualifiedStar: Boolean ...
  def markAsSupportQualifiedStar: Attribute ...
}
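
Following the asHiddenCol pattern just below, the suggested helper might be fleshed out roughly like this (a sketch; the property key and method names come from the suggestion above, the constant name QUALIFIED_STAR_ATTR_KEY is an assumption):

val QUALIFIED_STAR_ATTR_KEY = "__support_qualified_star"

implicit class MetadataColumnHelper(attr: Attribute) {
  def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) &&
    attr.metadata.getBoolean(METADATA_COL_ATTR_KEY)

  def supportQualifiedStar: Boolean = isMetadataCol &&
    attr.metadata.contains(QUALIFIED_STAR_ATTR_KEY) &&
    attr.metadata.getBoolean(QUALIFIED_STAR_ATTR_KEY)

  def markAsSupportQualifiedStar: Attribute = attr.withMetadata(
    new MetadataBuilder()
      .withMetadata(attr.metadata)
      .putBoolean(METADATA_COL_ATTR_KEY, true)
      .putBoolean(QUALIFIED_STAR_ATTR_KEY, true)
      .build())
}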

implicit class HiddenColumnHelper(attr: Attribute) {
def isHiddenCol: Boolean = attr.isMetadataCol &&
attr.metadata.contains(HIDDEN_COL_ATTR_KEY) &&
attr.metadata.getBoolean(HIDDEN_COL_ATTR_KEY)

def asHiddenCol(): Attribute = attr.withMetadata(
new MetadataBuilder()
.withMetadata(attr.metadata)
.putBoolean(METADATA_COL_ATTR_KEY, true)
.putBoolean(HIDDEN_COL_ATTR_KEY, true)
.build()
)
}
}
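
A small usage sketch of the new helpers (the attribute here is a stand-in, not code from the change):

import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.types.StringType

val attr = AttributeReference("key", StringType)()
val hidden = attr.asHiddenCol()
assert(hidden.isMetadataCol && hidden.isHiddenCol) // asHiddenCol sets both metadata flags
assert(!attr.isHiddenCol)                          // the original attribute is unchanged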
@@ -21,14 +21,13 @@ import scala.collection.JavaConverters._

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis.{PartitionSpec, ResolvedPartitionSpec, UnresolvedPartitionSpec}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.util.METADATA_COL_ATTR_KEY
import org.apache.spark.sql.connector.catalog.{MetadataColumn, SupportsAtomicPartitionManagement, SupportsDelete, SupportsPartitionManagement, SupportsRead, SupportsWrite, Table, TableCapability, TruncatableTable}
import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType}
import org.apache.spark.sql.util.CaseInsensitiveStringMap

object DataSourceV2Implicits {
private val METADATA_COL_ATTR_KEY = "__metadata_col"

implicit class TableHelper(table: Table) {
def asReadable: SupportsRead = {
table match {
@@ -103,11 +102,6 @@ object DataSourceV2Implicits {
def toAttributes: Seq[AttributeReference] = asStruct.toAttributes
}

implicit class MetadataColumnHelper(attr: Attribute) {
def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) &&
attr.metadata.getBoolean(METADATA_COL_ATTR_KEY)
}

implicit class OptionsHelper(options: Map[String, String]) {
def asOptions: CaseInsensitiveStringMap = {
new CaseInsensitiveStringMap(options.asJava)