apache · AnishMahto · May 12, 2026 · May 12, 2026 · May 13, 2026 · May 13, 2026
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
@@ -191,6 +191,18 @@
     ],
     "sqlState" : "0A000"
   },
+  "AUTOCDC_COLUMNS_NOT_FOUND_IN_SCHEMA" : {
+    "message" : [
+      "Using <caseSensitivity> column name comparison, the following columns are not present in the <schemaName> schema: <missingColumns>. Available columns: <availableColumns>."
+    ],
+    "sqlState" : "42703"
+  },
+  "AUTOCDC_MULTIPART_COLUMN_IDENTIFIER" : {
+    "message" : [
+      "Expected a single column identifier; got the multi-part identifier <columnName> (parts: <nameParts>)."
+    ],
+    "sqlState" : "42703"
+  },
   "AVRO_CANNOT_WRITE_NULL_FIELD" : {
     "message" : [
       "Cannot write null value for field <name> defined as non-null Avro data type <dataType>.",

diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/ChangeArgs.scala
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.pipelines.autocdc
+
+import org.apache.spark.sql.{AnalysisException, Column}
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
+import org.apache.spark.sql.catalyst.util.QuotingUtils
+import org.apache.spark.sql.types.StructType
+
+/**
+ * One bare column name: never a nested field path, never prefixed by a table or alias. Backticks
+ * in the input are consumed ("`a.b`" is held as "a.b"), so [[name]] compares directly with schema
+ * field names. Use [[quoted]] wherever the identifier string will be parsed again.
+ */
+case class UnqualifiedColumnName private (name: String) {
+  def quoted: String = QuotingUtils.quoteIdentifier(name)
+}
+
+object UnqualifiedColumnName {
+  /**
+   * Parses `input` as a SQL identifier and wraps its only name part; any other number of parts
+   * is rejected with an `AUTOCDC_MULTIPART_COLUMN_IDENTIFIER` [[AnalysisException]].
+   */
+  def apply(input: String): UnqualifiedColumnName =
+    CatalystSqlParser.parseMultipartIdentifier(input) match {
+      case Seq(onlyPart) => new UnqualifiedColumnName(onlyPart)
+      case parts => throw multipartColumnIdentifierError(input, parts)
+    }
+
+  /** Error for a `columnName` that did not parse into exactly one name part. */
+  private def multipartColumnIdentifierError(
+      columnName: String,
+      nameParts: Seq[String]): AnalysisException = {
+    val params = Map("columnName" -> columnName, "nameParts" -> nameParts.mkString(", "))
+    new AnalysisException(
+      errorClass = "AUTOCDC_MULTIPART_COLUMN_IDENTIFIER",
+      messageParameters = params)
+  }
+}
+
+sealed trait ColumnSelection
+object ColumnSelection {
+
+  case class IncludeColumns(columns: Seq[UnqualifiedColumnName]) extends ColumnSelection
+  case class ExcludeColumns(columns: Seq[UnqualifiedColumnName])
+      extends ColumnSelection
+
+  /**
+   * Filters `schema` according to `columnSelection`. Retained fields keep their position
+   * relative to one another in the original schema, regardless of the order they were listed in.
+   *
+   * @param schemaName      Human-readable label for `schema` (e.g. "microbatch", "target"); only
+   *                        used when reporting selected columns that do not exist.
+   * @param schema          Schema whose fields are being selected.
+   * @param columnSelection Columns to keep or drop; `None` returns the input `schema` as-is.
+   * @param caseSensitive   Match selected names against field names case-sensitively when true.
+   *                        Callers should take this from the session (e.g.
+   *                        `session.sessionState.conf.caseSensitiveAnalysis`) so the behavior
+   *                        agrees with `spark.sql.caseSensitive`.
+   */
+  def applyToSchema(
+      schemaName: String,
+      schema: StructType,
+      columnSelection: Option[ColumnSelection],
+      caseSensitive: Boolean): StructType = {
+    // Rebuilds the schema from the fields whose position passes `keep`, in original order.
+    def retain(keep: Int => Boolean): StructType =
+      StructType(schema.fields.zipWithIndex.filter { case (_, pos) => keep(pos) }.map(_._1))
+
+    columnSelection match {
+      case Some(IncludeColumns(included)) =>
+        retain(lookupFieldIndices(schemaName, schema, included, caseSensitive))
+      case Some(ExcludeColumns(excluded)) =>
+        val dropped = lookupFieldIndices(schemaName, schema, excluded, caseSensitive)
+        retain(pos => !dropped(pos))
+      case None =>
+        // Nothing was selected explicitly, so the schema passes through untouched.
+        schema
+    }
+  }
+
+  /** Positions in `schema` of all `fields`; throws if any of them cannot be resolved. */
+  private def lookupFieldIndices(
+      schemaName: String,
+      schema: StructType,
+      fields: Seq[UnqualifiedColumnName],
+      caseSensitive: Boolean): Set[Int] = {
+    def positionOf(fieldName: String): Option[Int] =
+      if (caseSensitive) schema.getFieldIndex(fieldName)
+      else schema.getFieldIndexCaseInsensitive(fieldName)
+
+    val (found, missing) =
+      fields.map(f => f.name -> positionOf(f.name)).partition(_._2.isDefined)
+    if (missing.nonEmpty) {
+      throw new AnalysisException(
+        errorClass = "AUTOCDC_COLUMNS_NOT_FOUND_IN_SCHEMA",
+        messageParameters = Map(
+          "caseSensitivity" -> CaseSensitivityLabels.of(caseSensitive),
+          "schemaName" -> schemaName,
+          "missingColumns" -> missing.map(_._1).distinct.mkString(", "),
+          "availableColumns" -> schema.fieldNames.mkString(", ")))
+    }
+    found.flatMap(_._2).toSet
+  }
+}
+
+/** Wording for the two case-sensitivity modes as shown to users in AutoCDC error messages. */
+private[autocdc] object CaseSensitivityLabels {
+  val CaseSensitive: String = "case-sensitive"
+  val CaseInsensitive: String = "case-insensitive"
+
+  def of(caseSensitive: Boolean): String =
+    if (!caseSensitive) CaseInsensitive else CaseSensitive
+}
+
+/** The Slowly Changing Dimension (SCD) strategy used by a CDC flow. */
+sealed trait ScdType
+
+object ScdType {
+  /** Marker for the standard SCD Type 1 strategy. */
+  case object Type1 extends ScdType
+  /** Marker for the standard SCD Type 2 strategy. */
+  case object Type2 extends ScdType
+}
+
+/**
+ * Configuration for an AutoCDC flow.
+ *
+ * @param keys            The column(s) that uniquely identify a row in the source data.
+ * @param sequencing      Expression ordering CDC events to correctly resolve out-of-order
+ *                        arrivals. Must be a sortable type.
+ * @param storedAsScdType The SCD strategy these args should be applied to.
+ * @param deleteCondition Expression that marks a source row as a DELETE. When None (the
+ *                        default), all rows are treated as upserts.
+ * @param columnSelection Which source columns to select in the target table. None (the
+ *                        default) means all columns.
+ */
+case class ChangeArgs(
+    keys: Seq[UnqualifiedColumnName],
+    sequencing: Column,
+    storedAsScdType: ScdType,
+    deleteCondition: Option[Column] = None,
+    columnSelection: Option[ColumnSelection] = None
+)
diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/autocdc/Scd1BatchProcessor.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.pipelines.autocdc
+
+import org.apache.spark.sql.{functions => F}
+import org.apache.spark.sql.catalyst.util.QuotingUtils
+import org.apache.spark.sql.classic.DataFrame
+import org.apache.spark.util.ArrayImplicits._
+
+/**
+ * Per-microbatch processor for SCD Type 1 AutoCDC flows, complying with the supplied
+ * [[changeArgs]] configuration.
+ */
+case class Scd1BatchProcessor(changeArgs: ChangeArgs) {
+  /**
+   * Collapses the incoming CDC microbatch to one row per key: the row carrying the greatest
+   * [[ChangeArgs.sequencing]] value among the rows that share the key.
+   *
+   * SCD1 only needs the most recent event per key. When several events for a key tie on the
+   * sequence value, which of them is kept is non-deterministic and undefined.
+   *
+   * @param validatedMicrobatch A microbatch that has already been validated: its sequencing
+   *                            column should contain no null values and should have a data
+   *                            type that supports ordering.
+   *
+   * @return A dataframe whose schema matches the schema of the microbatch exactly.
+   */
+  def deduplicateMicrobatch(validatedMicrobatch: DataFrame): DataFrame = {
+    val rowColName = Scd1BatchProcessor.winningRowColName
+
+    // Quote every identifier handed to `F.col` so the name is taken literally when re-parsed.
+    val keyCols = changeArgs.keys.map(key => F.col(key.quoted))
+
+    // `max_by` yields a single column, so the whole row is packed into one struct for the
+    // aggregation and expanded back into top-level columns afterwards.
+    val packedRow = F.struct(
+      validatedMicrobatch.columns
+        .map(name => F.col(QuotingUtils.quoteIdentifier(name)))
+        .toImmutableArraySeq: _*)
+    val latestRowPerKey = F.max_by(packedRow, changeArgs.sequencing).as(rowColName)
+
+    validatedMicrobatch
+      .groupBy(keyCols: _*)
+      .agg(latestRowPerKey)
+      .select(F.col(s"$rowColName.*"))
+  }
+}
+
+object Scd1BatchProcessor {
+  // Temporary struct column; the `__spark_autocdc_` prefix is reserved for SDP AutoCDC internals.
+  private val winningRowColName = "__spark_autocdc_winning_row"
+}