-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-56510][SQL] Fix ReplaceData DML without metadata attributes not projecting out the operation column #55372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c70e1da
8b37604
f69f4e6
f70cc22
cae5e1e
69b2c42
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -51,6 +51,8 @@ class InMemoryRowLevelOperationTable( | |
| private final val INDEX_COLUMN_REF = FieldReference(IndexColumn.name) | ||
| private final val SUPPORTS_DELTAS = "supports-deltas" | ||
| private final val SPLIT_UPDATES = "split-updates" | ||
| private final val NO_METADATA = "no-metadata" | ||
| private final val noMetadata = properties.getOrDefault(NO_METADATA, "false") == "true" | ||
|
|
||
| // used in row-level operation tests to verify replaced partitions | ||
| var replacedPartitions: Seq[Seq[Any]] = Seq.empty | ||
|
|
@@ -73,7 +75,11 @@ class InMemoryRowLevelOperationTable( | |
| var configuredScan: InMemoryBatchScan = _ | ||
|
|
||
| override def requiredMetadataAttributes(): Array[NamedReference] = { | ||
| Array(PARTITION_COLUMN_REF, INDEX_COLUMN_REF) | ||
| if (noMetadata) { | ||
| Array.empty | ||
| } else { | ||
| Array(PARTITION_COLUMN_REF, INDEX_COLUMN_REF) | ||
| } | ||
| } | ||
|
|
||
| override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { | ||
|
|
@@ -89,22 +95,29 @@ class InMemoryRowLevelOperationTable( | |
| override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { | ||
| lastWriteInfo = info | ||
| new WriteBuilder { | ||
| override def build(): Write = new Write with RequiresDistributionAndOrdering { | ||
| override def requiredDistribution: Distribution = { | ||
| Distributions.clustered(Array(PARTITION_COLUMN_REF)) | ||
| override def build(): Write = if (noMetadata) { | ||
| new Write { | ||
| override def toBatch: BatchWrite = PartitionBasedReplaceData(configuredScan) | ||
| override def description: String = "InMemoryWrite" | ||
| } | ||
|
|
||
| override def requiredOrdering: Array[SortOrder] = { | ||
| Array[SortOrder]( | ||
| LogicalExpressions.sort( | ||
| PARTITION_COLUMN_REF, | ||
| SortDirection.ASCENDING, | ||
| SortDirection.ASCENDING.defaultNullOrdering())) | ||
| } else { | ||
| new Write with RequiresDistributionAndOrdering { | ||
| override def requiredDistribution: Distribution = { | ||
| Distributions.clustered(Array(PARTITION_COLUMN_REF)) | ||
| } | ||
|
|
||
| override def requiredOrdering: Array[SortOrder] = { | ||
| Array[SortOrder]( | ||
| LogicalExpressions.sort( | ||
| PARTITION_COLUMN_REF, | ||
| SortDirection.ASCENDING, | ||
| SortDirection.ASCENDING.defaultNullOrdering())) | ||
| } | ||
|
|
||
| override def toBatch: BatchWrite = PartitionBasedReplaceData(configuredScan) | ||
|
|
||
| override def description: String = "InMemoryWrite" | ||
| } | ||
|
|
||
| override def toBatch: BatchWrite = PartitionBasedReplaceData(configuredScan) | ||
|
|
||
| override def description: String = "InMemoryWrite" | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -138,7 +151,11 @@ class InMemoryRowLevelOperationTable( | |
| private final val PK_COLUMN_REF = FieldReference("pk") | ||
|
|
||
| override def requiredMetadataAttributes(): Array[NamedReference] = { | ||
| Array(PARTITION_COLUMN_REF, INDEX_COLUMN_REF) | ||
| if (noMetadata) { | ||
| Array.empty | ||
| } else { | ||
| Array(PARTITION_COLUMN_REF, INDEX_COLUMN_REF) | ||
| } | ||
| } | ||
|
|
||
| override def rowId(): Array[NamedReference] = Array(PK_COLUMN_REF) | ||
|
|
@@ -150,22 +167,28 @@ class InMemoryRowLevelOperationTable( | |
| override def newWriteBuilder(info: LogicalWriteInfo): DeltaWriteBuilder = { | ||
| lastWriteInfo = info | ||
| new DeltaWriteBuilder { | ||
| override def build(): DeltaWrite = new DeltaWrite with RequiresDistributionAndOrdering { | ||
|
|
||
| override def requiredDistribution(): Distribution = { | ||
| Distributions.clustered(Array(PARTITION_COLUMN_REF)) | ||
| override def build(): DeltaWrite = if (noMetadata) { | ||
| new DeltaWrite { | ||
| override def toBatch: DeltaBatchWrite = TestDeltaBatchWrite | ||
| } | ||
|
|
||
| override def requiredOrdering(): Array[SortOrder] = { | ||
| Array[SortOrder]( | ||
| LogicalExpressions.sort( | ||
| PARTITION_COLUMN_REF, | ||
| SortDirection.ASCENDING, | ||
| SortDirection.ASCENDING.defaultNullOrdering()) | ||
| ) | ||
| } else { | ||
| new DeltaWrite with RequiresDistributionAndOrdering { | ||
|
|
||
| override def requiredDistribution(): Distribution = { | ||
| Distributions.clustered(Array(PARTITION_COLUMN_REF)) | ||
| } | ||
|
|
||
| override def requiredOrdering(): Array[SortOrder] = { | ||
| Array[SortOrder]( | ||
| LogicalExpressions.sort( | ||
| PARTITION_COLUMN_REF, | ||
| SortDirection.ASCENDING, | ||
| SortDirection.ASCENDING.defaultNullOrdering()) | ||
| ) | ||
| } | ||
|
|
||
| override def toBatch: DeltaBatchWrite = TestDeltaBatchWrite | ||
| } | ||
|
|
||
| override def toBatch: DeltaBatchWrite = TestDeltaBatchWrite | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -208,21 +231,24 @@ private class DeltaBufferWriter(schema: StructType) extends BufferWriter(schema) | |
| override def delete(meta: InternalRow, id: InternalRow): Unit = { | ||
| val pk = id.getInt(0) | ||
| buffer.deletes += pk | ||
| val logEntry = new GenericInternalRow(Array[Any](DELETE, pk, meta.copy(), null)) | ||
| val metaCopy = if (meta != null) meta.copy() else null | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This null guard is needed because
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this falls outside the scope of this PR. |
||
| val logEntry = new GenericInternalRow(Array[Any](DELETE, pk, metaCopy, null)) | ||
| buffer.log += logEntry | ||
| } | ||
|
|
||
| override def update(meta: InternalRow, id: InternalRow, row: InternalRow): Unit = { | ||
| val pk = id.getInt(0) | ||
| buffer.deletes += pk | ||
| buffer.rows.append(row.copy()) | ||
| val logEntry = new GenericInternalRow(Array[Any](UPDATE, pk, meta.copy(), row.copy())) | ||
| val metaCopy = if (meta != null) meta.copy() else null | ||
| val logEntry = new GenericInternalRow(Array[Any](UPDATE, pk, metaCopy, row.copy())) | ||
| buffer.log += logEntry | ||
| } | ||
|
|
||
| override def reinsert(meta: InternalRow, row: InternalRow): Unit = { | ||
| buffer.rows.append(row.copy()) | ||
| val logEntry = new GenericInternalRow(Array[Any](REINSERT, null, meta.copy(), row.copy())) | ||
| val metaCopy = if (meta != null) meta.copy() else null | ||
| val logEntry = new GenericInternalRow(Array[Any](REINSERT, null, metaCopy, row.copy())) | ||
| buffer.log += logEntry | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
question: Is it intentional that the
noMetadatapath bypassesRequiresDistributionAndOrdering? This exercises a different physical plan (no shuffle/sort). If the goal is just to test the no-metadata code path, consider keeping the distribution/ordering requirements so these tests cover the same physical plan shape as the existing suites.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's intentional. Without metadata, we don't have
PARTITION_COLUMN_REF. Without it, we can't guarantee partitioning and can't know which column to use for distribution / ordering. This still exercises the code paths for different writing tasks, so it should be enough.