-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-34527][SQL] Resolve duplicated common columns from USING/NATURAL JOIN #31666
Changes from 42 commits
1c5ab03
2fe733f
2c261bb
80beda8
e1719d3
6fa70ba
0ba1916
2b7e730
0c116a5
bf87f55
b5dc44f
6e32b3d
7c3f5df
181751a
ad5e824
e36e853
db44c53
73b7c8a
1eb01e2
9fd2490
f5cc3ae
fa7207e
7af12ae
07f9ad5
c474745
0f267e7
ed0270c
66ad572
fc3b16d
f665030
85b81b1
eab7964
44ee9f8
c84f396
0fe04a2
c7c3df6
b1bf28d
47be66d
8c5144e
333a815
49de5c5
9e62d7d
446d4bc
8f70c2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -914,41 +914,30 @@ class Analyzer(override val catalogManager: CatalogManager) | |
* Adds metadata columns to output for child relations when nodes are missing resolved attributes. | ||
* | ||
* References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]], | ||
* but the relation's output does not include the metadata columns until the relation is replaced | ||
* using [[DataSourceV2Relation.withMetadataColumns()]]. Unless this rule adds metadata to the | ||
* relation's output, the analyzer will detect that nothing produces the columns. | ||
* but the relation's output does not include the metadata columns until the relation is replaced. | ||
* Unless this rule adds metadata to the relation's output, the analyzer will detect that nothing | ||
* produces the columns. | ||
* | ||
* This rule only adds metadata columns when a node is resolved but is missing input from its | ||
* children. This ensures that metadata columns are not added to the plan unless they are used. By | ||
* checking only resolved nodes, this ensures that * expansion is already done so that metadata | ||
* columns are not accidentally selected by *. | ||
* columns are not accidentally selected by *. This rule resolves operators downwards to avoid | ||
* projecting away metadata columns prematurely. | ||
*/ | ||
object AddMetadataColumns extends Rule[LogicalPlan] { | ||
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ | ||
|
||
private def hasMetadataCol(plan: LogicalPlan): Boolean = { | ||
plan.expressions.exists(_.find { | ||
case a: Attribute => a.isMetadataCol | ||
case _ => false | ||
}.isDefined) | ||
} | ||
import org.apache.spark.sql.catalyst.util._ | ||
|
||
private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { | ||
case r: DataSourceV2Relation => r.withMetadataColumns() | ||
case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) | ||
} | ||
|
||
def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { | ||
def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDown { | ||
// Add metadata output to all node types | ||
case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) => | ||
val inputAttrs = AttributeSet(node.children.flatMap(_.output)) | ||
val metaCols = node.expressions.flatMap(_.collect { | ||
case a: Attribute if a.isMetadataCol && !inputAttrs.contains(a) => a | ||
}) | ||
val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains) | ||
if (metaCols.isEmpty) { | ||
node | ||
} else { | ||
val newNode = addMetadataCol(node) | ||
// We should not change the output schema of the plan. We should project away the extr | ||
// We should not change the output schema of the plan. We should project away the extra | ||
// metadata columns if necessary. | ||
if (newNode.sameOutput(node)) { | ||
newNode | ||
|
@@ -957,6 +946,36 @@ class Analyzer(override val catalogManager: CatalogManager) | |
} | ||
} | ||
} | ||
|
||
private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = { | ||
lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) | ||
plan.expressions.flatMap(_.collect { | ||
case a: Attribute if a.isMetadataCol => a | ||
case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) => | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This occurs in the case that a column is resolved below the level at which it becomes labeled as metadata. For the NATURAL/USING JOIN, this occurs when the column is resolved at the level of the root table - it is only labeled as hidden when it is used as a key column in the join. |
||
childMetadataOutput.find(_.exprId == a.exprId).get | ||
}) | ||
} | ||
|
||
private def hasMetadataCol(plan: LogicalPlan): Boolean = { | ||
lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput) | ||
val hasMetaCol = plan.expressions.exists(_.find { | ||
case a: Attribute => | ||
// If an attribute is resolved before being labeled as metadata | ||
// (i.e. from the originating Dataset), we check with expression ID | ||
a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId) | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
case _ => false | ||
}.isDefined) | ||
hasMetaCol | ||
} | ||
|
||
private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { | ||
case r: DataSourceV2Relation => r.withMetadataColumns() | ||
case p: Project => | ||
p.copy( | ||
projectList = p.metadataOutput ++ p.projectList, | ||
child = addMetadataCol(p.child)) | ||
case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) | ||
} | ||
} | ||
|
||
/** | ||
|
@@ -1897,10 +1916,10 @@ class Analyzer(override val catalogManager: CatalogManager) | |
} | ||
|
||
/** | ||
* This method tries to resolve expressions and find missing attributes recursively. Specially, | ||
* when the expressions used in `Sort` or `Filter` contain unresolved attributes or resolved | ||
* attributes which are missed from child output. This method tries to find the missing | ||
* attributes out and add into the projection. | ||
* This method tries to resolve expressions and find missing attributes recursively. | ||
* Specifically, when the expressions used in `Sort` or `Filter` contain unresolved attributes | ||
* or resolved attributes which are missing from child output. This method tries to find the | ||
* missing attributes and add them into the projection. | ||
*/ | ||
private def resolveExprsAndAddMissingAttrs( | ||
exprs: Seq[Expression], plan: LogicalPlan): (Seq[Expression], LogicalPlan) = { | ||
|
@@ -3144,7 +3163,9 @@ class Analyzer(override val catalogManager: CatalogManager) | |
joinType: JoinType, | ||
joinNames: Seq[String], | ||
condition: Option[Expression], | ||
hint: JoinHint) = { | ||
hint: JoinHint): LogicalPlan = { | ||
import org.apache.spark.sql.catalyst.util._ | ||
|
||
val leftKeys = joinNames.map { keyName => | ||
left.output.find(attr => resolver(attr.name, keyName)).getOrElse { | ||
throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left") | ||
|
@@ -3164,26 +3185,32 @@ class Analyzer(override val catalogManager: CatalogManager) | |
val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att)) | ||
|
||
// the output list looks like: join keys, columns from left, columns from right | ||
val projectList = joinType match { | ||
val (projectList, hiddenList) = joinType match { | ||
case LeftOuter => | ||
leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true)) | ||
(leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true)), rightKeys) | ||
case LeftExistence(_) => | ||
leftKeys ++ lUniqueOutput | ||
(leftKeys ++ lUniqueOutput, Seq.empty) | ||
case RightOuter => | ||
rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput | ||
(rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput, leftKeys) | ||
case FullOuter => | ||
// in full outer join, joinCols should be non-null if there is. | ||
val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() } | ||
joinedCols ++ | ||
(joinedCols ++ | ||
lUniqueOutput.map(_.withNullability(true)) ++ | ||
rUniqueOutput.map(_.withNullability(true)) | ||
rUniqueOutput.map(_.withNullability(true)), | ||
leftKeys ++ rightKeys) | ||
case _ : InnerLike => | ||
leftKeys ++ lUniqueOutput ++ rUniqueOutput | ||
(leftKeys ++ lUniqueOutput ++ rUniqueOutput, rightKeys) | ||
case _ => | ||
sys.error("Unsupported natural join type " + joinType) | ||
} | ||
// use Project to trim unnecessary fields | ||
Project(projectList, Join(left, right, joinType, newCondition, hint)) | ||
// use Project to hide duplicated common keys | ||
// propagate hidden columns from nested USING/NATURAL JOINs | ||
val project = Project(projectList, Join(left, right, joinType, newCondition, hint)) | ||
project.setTagValue( | ||
Project.hiddenOutputTag, | ||
hiddenList.map(_.asHiddenCol()) ++ project.child.metadataOutput.filter(_.isHiddenCol)) | ||
project | ||
} | ||
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,7 +87,7 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan => | |
if (!analyzed) { | ||
AnalysisHelper.allowInvokingTransformsInAnalyzer { | ||
val afterRuleOnChildren = mapChildren(_.resolveOperatorsUp(rule)) | ||
if (self fastEquals afterRuleOnChildren) { | ||
val newNode = if (self fastEquals afterRuleOnChildren) { | ||
CurrentOrigin.withOrigin(origin) { | ||
rule.applyOrElse(self, identity[LogicalPlan]) | ||
} | ||
|
@@ -96,6 +96,8 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan => | |
rule.applyOrElse(afterRuleOnChildren, identity[LogicalPlan]) | ||
} | ||
} | ||
newNode.copyTagsFrom(this) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This exists in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it's a mistake. |
||
newNode | ||
} | ||
} else { | ||
self | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,7 +25,7 @@ import java.util.concurrent.atomic.AtomicBoolean | |
import org.apache.spark.internal.Logging | ||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.internal.SQLConf | ||
import org.apache.spark.sql.types.{NumericType, StringType} | ||
import org.apache.spark.sql.types.{MetadataBuilder, NumericType, StringType} | ||
import org.apache.spark.unsafe.types.UTF8String | ||
import org.apache.spark.util.Utils | ||
|
||
|
@@ -201,4 +201,30 @@ package object util extends Logging { | |
def truncatedString[T](seq: Seq[T], sep: String, maxFields: Int): String = { | ||
truncatedString(seq, "", sep, "", maxFields) | ||
} | ||
|
||
val METADATA_COL_ATTR_KEY = "__metadata_col" | ||
|
||
/** | ||
* Hidden columns are a type of metadata column that are candidates during qualified | ||
* star expansions. They are propagated through Projects that have hidden children output, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The comment needs update again. |
||
* so that nested hidden output is not lost. | ||
*/ | ||
val HIDDEN_COL_ATTR_KEY = "__hidden_col" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The semantic is clear now, let's refine the naming. We only have metadata column, and metadata column can be included in qualified star if required. We can just add a new property to metadata columns to indicate it. The property name can be
|
||
|
||
implicit class SpecialColumnHelper(attr: Attribute) { | ||
def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && | ||
attr.metadata.getBoolean(METADATA_COL_ATTR_KEY) | ||
|
||
def isHiddenCol: Boolean = attr.isMetadataCol && | ||
attr.metadata.contains(HIDDEN_COL_ATTR_KEY) && | ||
attr.metadata.getBoolean(HIDDEN_COL_ATTR_KEY) | ||
|
||
def asHiddenCol(): Attribute = attr.withMetadata( | ||
new MetadataBuilder() | ||
.withMetadata(attr.metadata) | ||
.putBoolean(METADATA_COL_ATTR_KEY, true) | ||
.putBoolean(HIDDEN_COL_ATTR_KEY, true) | ||
.build() | ||
) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: we can avoid building a new
Seq
frequently. The check can beplan.children.exists(c => c.metadataOutput.exists(_.exprId == a.exprId))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The same to
hasMetadataCol